| OLD | NEW |
| 1 ;***************************************************************************** | 1 ;***************************************************************************** |
| 2 ;* x86inc.asm: x264asm abstraction layer | 2 ;* x86inc.asm: x264asm abstraction layer |
| 3 ;***************************************************************************** | 3 ;***************************************************************************** |
| 4 ;* Copyright (C) 2005-2012 x264 project | 4 ;* Copyright (C) 2005-2015 x264 project |
| 5 ;* | 5 ;* |
| 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> | 6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> |
| 7 ;* Anton Mitrofanov <BugMaster@narod.ru> | 7 ;* Anton Mitrofanov <BugMaster@narod.ru> |
| 8 ;* Jason Garrett-Glaser <darkshikari@gmail.com> | 8 ;* Fiona Glaser <fiona@x264.com> |
| 9 ;* Henrik Gramner <hengar-6@student.ltu.se> | 9 ;* Henrik Gramner <henrik@gramner.com> |
| 10 ;* | 10 ;* |
| 11 ;* Permission to use, copy, modify, and/or distribute this software for any | 11 ;* Permission to use, copy, modify, and/or distribute this software for any |
| 12 ;* purpose with or without fee is hereby granted, provided that the above | 12 ;* purpose with or without fee is hereby granted, provided that the above |
| 13 ;* copyright notice and this permission notice appear in all copies. | 13 ;* copyright notice and this permission notice appear in all copies. |
| 14 ;* | 14 ;* |
| 15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | 15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | 16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | 17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| 18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | 18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | 19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | 20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| 21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | 21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 22 ;***************************************************************************** | 22 ;***************************************************************************** |
| 23 | 23 |
| 24 ; This is a header file for the x264ASM assembly language, which uses | 24 ; This is a header file for the x264ASM assembly language, which uses |
| 25 ; NASM/YASM syntax combined with a large number of macros to provide easy | 25 ; NASM/YASM syntax combined with a large number of macros to provide easy |
| 26 ; abstraction between different calling conventions (x86_32, win64, linux64). | 26 ; abstraction between different calling conventions (x86_32, win64, linux64). |
| 27 ; It also has various other useful features to simplify writing the kind of | 27 ; It also has various other useful features to simplify writing the kind of |
| 28 ; DSP functions that are most often used in x264. | 28 ; DSP functions that are most often used in x264. |
| 29 | 29 |
| 30 ; Unlike the rest of x264, this file is available under an ISC license, as it | 30 ; Unlike the rest of x264, this file is available under an ISC license, as it |
| 31 ; has significant usefulness outside of x264 and we want it to be available | 31 ; has significant usefulness outside of x264 and we want it to be available |
| 32 ; to the largest audience possible. Of course, if you modify it for your own | 32 ; to the largest audience possible. Of course, if you modify it for your own |
| 33 ; purposes to add a new feature, we strongly encourage contributing a patch | 33 ; purposes to add a new feature, we strongly encourage contributing a patch |
| 34 ; as this feature might be useful for others as well. Send patches or ideas | 34 ; as this feature might be useful for others as well. Send patches or ideas |
| 35 ; to x264-devel@videolan.org . | 35 ; to x264-devel@videolan.org . |
| 36 | 36 |
| 37 %include "vpx_config.asm" | 37 %include "vpx_config.asm" |
| 38 | 38 |
| 39 %ifndef program_name | 39 %ifndef private_prefix |
| 40 %define program_name vp9 | 40 %define private_prefix vpx |
| 41 %endif | 41 %endif |
| 42 | 42 |
| 43 %ifndef public_prefix |
| 44 %define public_prefix private_prefix |
| 45 %endif |
| 43 | 46 |
| 47 %ifndef STACK_ALIGNMENT |
| 48 %if ARCH_X86_64 |
| 49 %define STACK_ALIGNMENT 16 |
| 50 %else |
| 51 %define STACK_ALIGNMENT 4 |
| 52 %endif |
| 53 %endif |
| 54 |
| 55 %define WIN64 0 |
| 44 %define UNIX64 0 | 56 %define UNIX64 0 |
| 45 %define WIN64 0 | |
| 46 %if ARCH_X86_64 | 57 %if ARCH_X86_64 |
| 47 %ifidn __OUTPUT_FORMAT__,win32 | 58 %ifidn __OUTPUT_FORMAT__,win32 |
| 48 %define WIN64 1 | 59 %define WIN64 1 |
| 49 %elifidn __OUTPUT_FORMAT__,win64 | 60 %elifidn __OUTPUT_FORMAT__,win64 |
| 50 %define WIN64 1 | 61 %define WIN64 1 |
| 51 %elifidn __OUTPUT_FORMAT__,x64 | 62 %elifidn __OUTPUT_FORMAT__,x64 |
| 52 %define WIN64 1 | 63 %define WIN64 1 |
| 53 %else | 64 %else |
| 54 %define UNIX64 1 | 65 %define UNIX64 1 |
| 55 %endif | 66 %endif |
| 56 %endif | 67 %endif |
| 57 | 68 |
| 58 %ifidn __OUTPUT_FORMAT__,elf32 | 69 %ifidn __OUTPUT_FORMAT__,elf32 |
| 59 %define mangle(x) x | 70 %define mangle(x) x |
| 60 %elifidn __OUTPUT_FORMAT__,elf64 | 71 %elifidn __OUTPUT_FORMAT__,elf64 |
| 61 %define mangle(x) x | 72 %define mangle(x) x |
| 62 %elifidn __OUTPUT_FORMAT__,elf | |
| 63 %define mangle(x) x | |
| 64 %elifidn __OUTPUT_FORMAT__,x64 | 73 %elifidn __OUTPUT_FORMAT__,x64 |
| 65 %define mangle(x) x | 74 %define mangle(x) x |
| 66 %elifidn __OUTPUT_FORMAT__,win64 | 75 %elifidn __OUTPUT_FORMAT__,win64 |
| 67 %define mangle(x) x | 76 %define mangle(x) x |
| 68 %else | 77 %else |
| 69 %define mangle(x) _ %+ x | 78 %define mangle(x) _ %+ x |
| 70 %endif | 79 %endif |
| 71 | 80 |
| 72 ; FIXME: All of the 64bit asm functions that take a stride as an argument | 81 ; In some instances macho32 tables get misaligned when using .rodata. |
| 73 ; via register, assume that the high dword of that register is filled with 0. | 82 ; When looking at the disassembly it appears that the offset is either |
| 74 ; This is true in practice (since we never do any 64bit arithmetic on strides, | 83 ; correct or consistently off by 90. Placing them in the .text section |
| 75 ; and x264's strides are all positive), but is not guaranteed by the ABI. | 84 ; works around the issue. It appears to be specific to the way libvpx |
| 76 | 85 ; handles the tables. |
| 77 ; Name of the .rodata section. | |
| 78 ; Kludge: Something on OS X fails to align .rodata even given an align attribute, | |
| 79 ; so use a different read-only section. | |
| 80 %macro SECTION_RODATA 0-1 16 | 86 %macro SECTION_RODATA 0-1 16 |
| 81 %ifidn __OUTPUT_FORMAT__,macho64 | 87 %ifidn __OUTPUT_FORMAT__,macho32 |
| 82 SECTION .text align=%1 | |
| 83 %elifidn __OUTPUT_FORMAT__,macho32 | |
| 84 SECTION .text align=%1 | |
| 85 fakegot: | |
| 86 %elifidn __OUTPUT_FORMAT__,macho | |
| 87 SECTION .text align=%1 | 88 SECTION .text align=%1 |
| 88 fakegot: | 89 fakegot: |
| 89 %elifidn __OUTPUT_FORMAT__,aout | 90 %elifidn __OUTPUT_FORMAT__,aout |
| 90 section .text | 91 SECTION .text |
| 91 %else | 92 %else |
| 92 SECTION .rodata align=%1 | 93 SECTION .rodata align=%1 |
| 93 %endif | 94 %endif |
| 94 %endmacro | 95 %endmacro |
| 95 | 96 |
| 96 ; aout does not support align= | |
| 97 %macro SECTION_TEXT 0-1 16 | 97 %macro SECTION_TEXT 0-1 16 |
| 98 %ifidn __OUTPUT_FORMAT__,aout | 98 %ifidn __OUTPUT_FORMAT__,aout |
| 99 SECTION .text | 99 SECTION .text |
| 100 %else | 100 %else |
| 101 SECTION .text align=%1 | 101 SECTION .text align=%1 |
| 102 %endif | 102 %endif |
| 103 %endmacro | 103 %endmacro |
| 104 | 104 |
| 105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" | 105 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" |
| 106 ; from original code is added in for 64bit. | 106 ; from original code is added in for 64bit. |
| 107 %ifidn __OUTPUT_FORMAT__,elf32 | 107 %ifidn __OUTPUT_FORMAT__,elf32 |
| 108 %define ABI_IS_32BIT 1 | 108 %define ABI_IS_32BIT 1 |
| 109 %elifidn __OUTPUT_FORMAT__,macho32 | 109 %elifidn __OUTPUT_FORMAT__,macho32 |
| 110 %define ABI_IS_32BIT 1 | 110 %define ABI_IS_32BIT 1 |
| 111 %elifidn __OUTPUT_FORMAT__,win32 | 111 %elifidn __OUTPUT_FORMAT__,win32 |
| 112 %define ABI_IS_32BIT 1 | 112 %define ABI_IS_32BIT 1 |
| 113 %elifidn __OUTPUT_FORMAT__,aout | 113 %elifidn __OUTPUT_FORMAT__,aout |
| 114 %define ABI_IS_32BIT 1 | 114 %define ABI_IS_32BIT 1 |
| 115 %else | 115 %else |
| 116 %define ABI_IS_32BIT 0 | 116 %define ABI_IS_32BIT 0 |
| 117 %endif | 117 %endif |
| 118 | 118 |
| 119 %if ABI_IS_32BIT | 119 %if ABI_IS_32BIT |
| 120 %if CONFIG_PIC=1 | 120 %if CONFIG_PIC=1 |
| 121 %ifidn __OUTPUT_FORMAT__,elf32 | 121 %ifidn __OUTPUT_FORMAT__,elf32 |
| 122 %define GET_GOT_SAVE_ARG 1 | 122 %define GET_GOT_SAVE_ARG 1 |
| 123 %define WRT_PLT wrt ..plt | 123 %define WRT_PLT wrt ..plt |
| 124 %macro GET_GOT 1 | 124 %macro GET_GOT 1 |
| 125 extern _GLOBAL_OFFSET_TABLE_ | 125 extern _GLOBAL_OFFSET_TABLE_ |
| 126 push %1 | 126 push %1 |
| 127 call %%get_got | 127 call %%get_got |
| 128 %%sub_offset: | 128 %%sub_offset: |
| 129 jmp %%exitGG | 129 jmp %%exitGG |
| 130 %%get_got: | 130 %%get_got: |
| 131 mov %1, [esp] | 131 mov %1, [esp] |
| 132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc | 132 add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc |
| 133 ret | 133 ret |
| 134 %%exitGG: | 134 %%exitGG: |
| 135 %undef GLOBAL | 135 %undef GLOBAL |
| 136 %define GLOBAL(x) x + %1 wrt ..gotoff | 136 %define GLOBAL(x) x + %1 wrt ..gotoff |
| 137 %undef RESTORE_GOT | 137 %undef RESTORE_GOT |
| 138 %define RESTORE_GOT pop %1 | 138 %define RESTORE_GOT pop %1 |
| 139 %endmacro | 139 %endmacro |
| 140 %elifidn __OUTPUT_FORMAT__,macho32 | 140 %elifidn __OUTPUT_FORMAT__,macho32 |
| 141 %define GET_GOT_SAVE_ARG 1 | 141 %define GET_GOT_SAVE_ARG 1 |
| 142 %macro GET_GOT 1 | 142 %macro GET_GOT 1 |
| 143 push %1 | 143 push %1 |
| 144 call %%get_got | 144 call %%get_got |
| 145 %%get_got: | 145 %%get_got: |
| 146 pop %1 | 146 pop %1 |
| 147 %undef GLOBAL | 147 %undef GLOBAL |
| 148 %define GLOBAL(x) x + %1 - %%get_got | 148 %define GLOBAL(x) x + %1 - %%get_got |
| 149 %undef RESTORE_GOT | 149 %undef RESTORE_GOT |
| 150 %define RESTORE_GOT pop %1 | 150 %define RESTORE_GOT pop %1 |
| 151 %endmacro | 151 %endmacro |
| 152 %endif | 152 %endif |
| 153 %endif | 153 %endif |
| 154 | 154 |
| 155 %if ARCH_X86_64 == 0 | 155 %if ARCH_X86_64 == 0 |
| 156 %undef PIC | 156 %undef PIC |
| 157 %endif | 157 %endif |
| 158 | 158 |
| 159 %else | 159 %else |
| 160 %macro GET_GOT 1 | 160 %macro GET_GOT 1 |
| 161 %endmacro | 161 %endmacro |
| 162 %define GLOBAL(x) rel x | 162 %define GLOBAL(x) rel x |
| 163 %define WRT_PLT wrt ..plt | 163 %define WRT_PLT wrt ..plt |
| 164 | 164 |
| 165 %if WIN64 | 165 %if WIN64 |
| 166 %define PIC | 166 %define PIC |
| 167 %elifidn __OUTPUT_FORMAT__,macho64 | 167 %elifidn __OUTPUT_FORMAT__,macho64 |
| 168 %define PIC | 168 %define PIC |
| 169 %elif CONFIG_PIC | 169 %elif CONFIG_PIC |
| 170 %define PIC | 170 %define PIC |
| 171 %endif | 171 %endif |
| 172 %endif | 172 %endif |
| 173 | 173 |
| 174 %ifnmacro GET_GOT | 174 %ifnmacro GET_GOT |
| 175 %macro GET_GOT 1 | 175 %macro GET_GOT 1 |
| 176 %endmacro | 176 %endmacro |
| 177 %define GLOBAL(x) x | 177 %define GLOBAL(x) x |
| 178 %endif | 178 %endif |
| 179 %ifndef RESTORE_GOT | 179 %ifndef RESTORE_GOT |
| 180 %define RESTORE_GOT | 180 %define RESTORE_GOT |
| 181 %endif | 181 %endif |
| 182 %ifndef WRT_PLT | 182 %ifndef WRT_PLT |
| 183 %define WRT_PLT | 183 %define WRT_PLT |
| 184 %endif | 184 %endif |
| 185 | 185 |
| 186 %ifdef PIC | 186 %ifdef PIC |
| 187 default rel | 187 default rel |
| 188 %endif | 188 %endif |
| 189 ; Done with PIC macros | 189 ; Done with PIC macros |
| 190 | 190 |
| 191 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32) | |
| 192 %ifndef __NASM_VER__ | |
| 193 CPU amdnop | |
| 194 %else | |
| 195 %use smartalign | |
| 196 ALIGNMODE k7 | |
| 197 %endif | |
| 198 | |
| 199 ; Macros to eliminate most code duplication between x86_32 and x86_64: | 191 ; Macros to eliminate most code duplication between x86_32 and x86_64: |
| 200 ; Currently this works only for leaf functions which load all their arguments | 192 ; Currently this works only for leaf functions which load all their arguments |
| 201 ; into registers at the start, and make no other use of the stack. Luckily that | 193 ; into registers at the start, and make no other use of the stack. Luckily that |
| 202 ; covers most of x264's asm. | 194 ; covers most of x264's asm. |
| 203 | 195 |
| 204 ; PROLOGUE: | 196 ; PROLOGUE: |
| 205 ; %1 = number of arguments. loads them from stack if needed. | 197 ; %1 = number of arguments. loads them from stack if needed. |
| 206 ; %2 = number of registers used. pushes callee-saved regs if needed. | 198 ; %2 = number of registers used. pushes callee-saved regs if needed. |
| 207 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. | 199 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. |
| 208 ; %4 = list of names to define to registers | 200 ; %4 = (optional) stack size to be allocated. The stack will be aligned before |
| 201 ; allocating the specified stack size. If the required stack alignment is |
| 202 ; larger than the known stack alignment the stack will be manually aligned |
| 203 ; and an extra register will be allocated to hold the original stack |
| 204 ; pointer (to not invalidate r0m etc.). To prevent the use of an extra |
| 205 ; register as stack pointer, request a negative stack size. |
| 206 ; %4+/%5+ = list of names to define to registers |
| 209 ; PROLOGUE can also be invoked by adding the same options to cglobal | 207 ; PROLOGUE can also be invoked by adding the same options to cglobal |
| 210 | 208 |
| 211 ; e.g. | 209 ; e.g. |
| 212 ; cglobal foo, 2,3,0, dst, src, tmp | 210 ; cglobal foo, 2,3,7,0x40, dst, src, tmp |
| 213 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp) | 211 ; declares a function (foo) that automatically loads two arguments (dst and |
| 212 ; src) into registers, uses one additional register (tmp) plus 7 vector |
| 213 ; registers (m0-m6) and allocates 0x40 bytes of stack space. |
| 214 | 214 |
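; A minimal sketch of the convention described above, assuming only the macros
; defined in this file ("copy16" is a hypothetical function name):
;
    INIT_XMM sse2
    cglobal copy16, 2, 2, 1, dst, src   ; 2 args, 2 GPRs, 1 xmm register, no stack space
        mova    m0, [srcq]              ; named args expand to the argument registers
        mova    [dstq], m0
        RET
;
; On win64 the arguments arrive in rcx/rdx, on unix64 in rdi/rsi, and on x86_32
; they are loaded from the stack; the function body is identical on all three ABIs.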
| 215 ; TODO Some functions can use some args directly from the stack. If they're the | 215 ; TODO Some functions can use some args directly from the stack. If they're the |
| 216 ; last args then you can just not declare them, but if they're in the middle | 216 ; last args then you can just not declare them, but if they're in the middle |
| 217 ; we need more flexible macro. | 217 ; we need more flexible macro. |
| 218 | 218 |
| 219 ; RET: | 219 ; RET: |
| 220 ; Pops anything that was pushed by PROLOGUE, and returns. | 220 ; Pops anything that was pushed by PROLOGUE, and returns. |
| 221 | 221 |
| 222 ; REP_RET: | 222 ; REP_RET: |
| 223 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons | 223 ; Use this instead of RET if it's a branch target. |
| 224 ; which are slow when a normal ret follows a branch. | |
| 225 | 224 |
| 226 ; registers: | 225 ; registers: |
| 227 ; rN and rNq are the native-size register holding function argument N | 226 ; rN and rNq are the native-size register holding function argument N |
| 228 ; rNd, rNw, rNb are dword, word, and byte size | 227 ; rNd, rNw, rNb are dword, word, and byte size |
| 228 ; rNh is the high 8 bits of the word size |
| 229 ; rNm is the original location of arg N (a register or on the stack), dword | 229 ; rNm is the original location of arg N (a register or on the stack), dword |
| 230 ; rNmp is native size | 230 ; rNmp is native size |
| 231 | 231 |
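; For example (a hedged sketch, assuming the rN defines established by the
; DECLARE_REG invocations for each ABI): r0 is the native-size register holding
; argument 0, r0d its dword form, and r2mp the original location of argument 2:
;
    movifnidn r0, r2mp    ; fetch arg 2 from wherever the ABI placed it
    movzx     r1d, r0w    ; zero-extend the low word of r0 into r1's dword form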
| 232 %macro DECLARE_REG 5-6 | 232 %macro DECLARE_REG 2-3 |
| 233 %define r%1q %2 | 233 %define r%1q %2 |
| 234 %define r%1d %3 | 234 %define r%1d %2d |
| 235 %define r%1w %4 | 235 %define r%1w %2w |
| 236 %define r%1b %5 | 236 %define r%1b %2b |
| 237 %if %0 == 5 | 237 %define r%1h %2h |
| 238 %define r%1m %3 | 238 %if %0 == 2 |
| 239 %define r%1m %2d |
| 239 %define r%1mp %2 | 240 %define r%1mp %2 |
| 240 %elif ARCH_X86_64 ; memory | 241 %elif ARCH_X86_64 ; memory |
| 241 %define r%1m [rsp + stack_offset + %6] | 242 %define r%1m [rstk + stack_offset + %3] |
| 242 %define r%1mp qword r %+ %1 %+ m | 243 %define r%1mp qword r %+ %1 %+ m |
| 243 %else | 244 %else |
| 244 %define r%1m [esp + stack_offset + %6] | 245 %define r%1m [rstk + stack_offset + %3] |
| 245 %define r%1mp dword r %+ %1 %+ m | 246 %define r%1mp dword r %+ %1 %+ m |
| 246 %endif | 247 %endif |
| 247 %define r%1 %2 | 248 %define r%1 %2 |
| 248 %endmacro | 249 %endmacro |
| 249 | 250 |
| 250 %macro DECLARE_REG_SIZE 2 | 251 %macro DECLARE_REG_SIZE 3 |
| 251 %define r%1q r%1 | 252 %define r%1q r%1 |
| 252 %define e%1q r%1 | 253 %define e%1q r%1 |
| 253 %define r%1d e%1 | 254 %define r%1d e%1 |
| 254 %define e%1d e%1 | 255 %define e%1d e%1 |
| 255 %define r%1w %1 | 256 %define r%1w %1 |
| 256 %define e%1w %1 | 257 %define e%1w %1 |
| 258 %define r%1h %3 |
| 259 %define e%1h %3 |
| 257 %define r%1b %2 | 260 %define r%1b %2 |
| 258 %define e%1b %2 | 261 %define e%1b %2 |
| 259 %if ARCH_X86_64 == 0 | 262 %if ARCH_X86_64 == 0 |
| 260 %define r%1 e%1 | 263 %define r%1 e%1 |
| 261 %endif | 264 %endif |
| 262 %endmacro | 265 %endmacro |
| 263 | 266 |
| 264 DECLARE_REG_SIZE ax, al | 267 DECLARE_REG_SIZE ax, al, ah |
| 265 DECLARE_REG_SIZE bx, bl | 268 DECLARE_REG_SIZE bx, bl, bh |
| 266 DECLARE_REG_SIZE cx, cl | 269 DECLARE_REG_SIZE cx, cl, ch |
| 267 DECLARE_REG_SIZE dx, dl | 270 DECLARE_REG_SIZE dx, dl, dh |
| 268 DECLARE_REG_SIZE si, sil | 271 DECLARE_REG_SIZE si, sil, null |
| 269 DECLARE_REG_SIZE di, dil | 272 DECLARE_REG_SIZE di, dil, null |
| 270 DECLARE_REG_SIZE bp, bpl | 273 DECLARE_REG_SIZE bp, bpl, null |
| 271 | 274 |
| 272 ; t# defines for when per-arch register allocation is more complex than just function arguments | 275 ; t# defines for when per-arch register allocation is more complex than just function arguments |
| 273 | 276 |
| 274 %macro DECLARE_REG_TMP 1-* | 277 %macro DECLARE_REG_TMP 1-* |
| 275 %assign %%i 0 | 278 %assign %%i 0 |
| 276 %rep %0 | 279 %rep %0 |
| 277 CAT_XDEFINE t, %%i, r%1 | 280 CAT_XDEFINE t, %%i, r%1 |
| 278 %assign %%i %%i+1 | 281 %assign %%i %%i+1 |
| 279 %rotate 1 | 282 %rotate 1 |
| 280 %endrep | 283 %endrep |
| 281 %endmacro | 284 %endmacro |
| 282 | 285 |
| 283 %macro DECLARE_REG_TMP_SIZE 0-* | 286 %macro DECLARE_REG_TMP_SIZE 0-* |
| 284 %rep %0 | 287 %rep %0 |
| 285 %define t%1q t%1 %+ q | 288 %define t%1q t%1 %+ q |
| 286 %define t%1d t%1 %+ d | 289 %define t%1d t%1 %+ d |
| 287 %define t%1w t%1 %+ w | 290 %define t%1w t%1 %+ w |
| 291 %define t%1h t%1 %+ h |
| 288 %define t%1b t%1 %+ b | 292 %define t%1b t%1 %+ b |
| 289 %rotate 1 | 293 %rotate 1 |
| 290 %endrep | 294 %endrep |
| 291 %endmacro | 295 %endmacro |
| 292 | 296 |
| 293 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | 297 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 |
| 294 | 298 |
| 295 %if ARCH_X86_64 | 299 %if ARCH_X86_64 |
| 296 %define gprsize 8 | 300 %define gprsize 8 |
| 297 %else | 301 %else |
| 298 %define gprsize 4 | 302 %define gprsize 4 |
| 299 %endif | 303 %endif |
| 300 | 304 |
| 301 %macro PUSH 1 | 305 %macro PUSH 1 |
| 302 push %1 | 306 push %1 |
| 303 %assign stack_offset stack_offset+gprsize | 307 %ifidn rstk, rsp |
| 308 %assign stack_offset stack_offset+gprsize |
| 309 %endif |
| 304 %endmacro | 310 %endmacro |
| 305 | 311 |
| 306 %macro POP 1 | 312 %macro POP 1 |
| 307 pop %1 | 313 pop %1 |
| 308 %assign stack_offset stack_offset-gprsize | 314 %ifidn rstk, rsp |
| 315 %assign stack_offset stack_offset-gprsize |
| 316 %endif |
| 309 %endmacro | 317 %endmacro |
| 310 | 318 |
| 311 %macro PUSH_IF_USED 1-* | 319 %macro PUSH_IF_USED 1-* |
| 312 %rep %0 | 320 %rep %0 |
| 313 %if %1 < regs_used | 321 %if %1 < regs_used |
| 314 PUSH r%1 | 322 PUSH r%1 |
| 315 %endif | 323 %endif |
| 316 %rotate 1 | 324 %rotate 1 |
| 317 %endrep | 325 %endrep |
| 318 %endmacro | 326 %endmacro |
| (...skipping 11 matching lines...) |
| 330 %rep %0 | 338 %rep %0 |
| 331 %if %1 < num_args | 339 %if %1 < num_args |
| 332 mov r%1, r %+ %1 %+ mp | 340 mov r%1, r %+ %1 %+ mp |
| 333 %endif | 341 %endif |
| 334 %rotate 1 | 342 %rotate 1 |
| 335 %endrep | 343 %endrep |
| 336 %endmacro | 344 %endmacro |
| 337 | 345 |
| 338 %macro SUB 2 | 346 %macro SUB 2 |
| 339 sub %1, %2 | 347 sub %1, %2 |
| 340 %ifidn %1, rsp | 348 %ifidn %1, rstk |
| 341 %assign stack_offset stack_offset+(%2) | 349 %assign stack_offset stack_offset+(%2) |
| 342 %endif | 350 %endif |
| 343 %endmacro | 351 %endmacro |
| 344 | 352 |
| 345 %macro ADD 2 | 353 %macro ADD 2 |
| 346 add %1, %2 | 354 add %1, %2 |
| 347 %ifidn %1, rsp | 355 %ifidn %1, rstk |
| 348 %assign stack_offset stack_offset-(%2) | 356 %assign stack_offset stack_offset-(%2) |
| 349 %endif | 357 %endif |
| 350 %endmacro | 358 %endmacro |
| 351 | 359 |
| 352 %macro movifnidn 2 | 360 %macro movifnidn 2 |
| 353 %ifnidn %1, %2 | 361 %ifnidn %1, %2 |
| 354 mov %1, %2 | 362 mov %1, %2 |
| 355 %endif | 363 %endif |
| 356 %endmacro | 364 %endmacro |
| 357 | 365 |
| 358 %macro movsxdifnidn 2 | 366 %macro movsxdifnidn 2 |
| 359 %ifnidn %1, %2 | 367 %ifnidn %1, %2 |
| 360 movsxd %1, %2 | 368 movsxd %1, %2 |
| 361 %endif | 369 %endif |
| 362 %endmacro | 370 %endmacro |
| 363 | 371 |
| 364 %macro ASSERT 1 | 372 %macro ASSERT 1 |
| 365 %if (%1) == 0 | 373 %if (%1) == 0 |
| 366 %error assert failed | 374 %error assert failed |
| 367 %endif | 375 %endif |
| 368 %endmacro | 376 %endmacro |
| 369 | 377 |
| 370 %macro DEFINE_ARGS 0-* | 378 %macro DEFINE_ARGS 0-* |
| 371 %ifdef n_arg_names | 379 %ifdef n_arg_names |
| 372 %assign %%i 0 | 380 %assign %%i 0 |
| 373 %rep n_arg_names | 381 %rep n_arg_names |
| 374 CAT_UNDEF arg_name %+ %%i, q | 382 CAT_UNDEF arg_name %+ %%i, q |
| 375 CAT_UNDEF arg_name %+ %%i, d | 383 CAT_UNDEF arg_name %+ %%i, d |
| 376 CAT_UNDEF arg_name %+ %%i, w | 384 CAT_UNDEF arg_name %+ %%i, w |
| 385 CAT_UNDEF arg_name %+ %%i, h |
| 377 CAT_UNDEF arg_name %+ %%i, b | 386 CAT_UNDEF arg_name %+ %%i, b |
| 378 CAT_UNDEF arg_name %+ %%i, m | 387 CAT_UNDEF arg_name %+ %%i, m |
| 379 CAT_UNDEF arg_name %+ %%i, mp | 388 CAT_UNDEF arg_name %+ %%i, mp |
| 380 CAT_UNDEF arg_name, %%i | 389 CAT_UNDEF arg_name, %%i |
| 381 %assign %%i %%i+1 | 390 %assign %%i %%i+1 |
| 382 %endrep | 391 %endrep |
| 383 %endif | 392 %endif |
| 384 | 393 |
| 385 %xdefine %%stack_offset stack_offset | 394 %xdefine %%stack_offset stack_offset |
| 386 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine | 395 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine |
| 387 %assign %%i 0 | 396 %assign %%i 0 |
| 388 %rep %0 | 397 %rep %0 |
| 389 %xdefine %1q r %+ %%i %+ q | 398 %xdefine %1q r %+ %%i %+ q |
| 390 %xdefine %1d r %+ %%i %+ d | 399 %xdefine %1d r %+ %%i %+ d |
| 391 %xdefine %1w r %+ %%i %+ w | 400 %xdefine %1w r %+ %%i %+ w |
| 401 %xdefine %1h r %+ %%i %+ h |
| 392 %xdefine %1b r %+ %%i %+ b | 402 %xdefine %1b r %+ %%i %+ b |
| 393 %xdefine %1m r %+ %%i %+ m | 403 %xdefine %1m r %+ %%i %+ m |
| 394 %xdefine %1mp r %+ %%i %+ mp | 404 %xdefine %1mp r %+ %%i %+ mp |
| 395 CAT_XDEFINE arg_name, %%i, %1 | 405 CAT_XDEFINE arg_name, %%i, %1 |
| 396 %assign %%i %%i+1 | 406 %assign %%i %%i+1 |
| 397 %rotate 1 | 407 %rotate 1 |
| 398 %endrep | 408 %endrep |
| 399 %xdefine stack_offset %%stack_offset | 409 %xdefine stack_offset %%stack_offset |
| 400 %assign n_arg_names %0 | 410 %assign n_arg_names %0 |
| 401 %endmacro | 411 %endmacro |
| 402 | 412 |
| 403 %if ARCH_X86_64 | 413 %define required_stack_alignment ((mmsize + 15) & ~15) |
| 404 %macro ALLOC_STACK 2 ; stack_size, num_regs | 414 |
| 405 %assign %%stack_aligment ((mmsize + 15) & ~15) | 415 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) |
| 406 %assign stack_size_padded %1 | 416 %ifnum %1 |
| 407 | 417 %if %1 != 0 |
| 408 %assign %%reg_num (%2 - 1) | 418 %assign %%pad 0 |
| 409 %xdefine rsp_tmp r %+ %%reg_num | 419 %assign stack_size %1 |
| 410 mov rsp_tmp, rsp | 420 %if stack_size < 0 |
| 411 sub rsp, stack_size_padded | 421 %assign stack_size -stack_size |
| 412 and rsp, ~(%%stack_aligment - 1) | 422 %endif |
| 413 %endmacro | 423 %if WIN64 |
| 414 | 424 %assign %%pad %%pad + 32 ; shadow space |
| 415 %macro RESTORE_STACK 0 ; reset rsp register | 425 %if mmsize != 8 |
| 416 mov rsp, rsp_tmp | 426 %assign xmm_regs_used %2 |
| 417 %endmacro | 427 %if xmm_regs_used > 8 |
| 418 %endif | 428 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers |
| 429 %endif |
| 430 %endif |
| 431 %endif |
| 432 %if required_stack_alignment <= STACK_ALIGNMENT |
| 433 ; maintain the current stack alignment |
| 434 %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) |
| 435 SUB rsp, stack_size_padded |
| 436 %else |
| 437 %assign %%reg_num (regs_used - 1) |
| 438 %xdefine rstk r %+ %%reg_num |
| 439 ; align stack, and save original stack location directly above |
| 440 ; it, i.e. in [rsp+stack_size_padded], so we can restore the |
| 441 ; stack in a single instruction (i.e. mov rsp, rstk or mov |
| 442 ; rsp, [rsp+stack_size_padded]) |
| 443 %if %1 < 0 ; need to store rsp on stack |
| 444 %xdefine rstkm [rsp + stack_size + %%pad] |
| 445 %assign %%pad %%pad + gprsize |
| 446 %else ; can keep rsp in rstk during whole function |
| 447 %xdefine rstkm rstk |
| 448 %endif |
| 449 %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) |
| 450 mov rstk, rsp |
| 451 and rsp, ~(required_stack_alignment-1) |
| 452 sub rsp, stack_size_padded |
| 453 movifnidn rstkm, rstk |
| 454 %endif |
| 455 WIN64_PUSH_XMM |
| 456 %endif |
| 457 %endif |
| 458 %endmacro |
| 459 |
| 460 %macro SETUP_STACK_POINTER 1 |
| 461 %ifnum %1 |
| 462 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT |
| 463 %if %1 > 0 |
| 464 %assign regs_used (regs_used + 1) |
| 465 %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 |
| 466 %warning "Stack pointer will overwrite register argument" |
| 467 %endif |
| 468 %endif |
| 469 %endif |
| 470 %endmacro |
| 471 |
| 472 %macro DEFINE_ARGS_INTERNAL 3+ |
| 473 %ifnum %2 |
| 474 DEFINE_ARGS %3 |
| 475 %elif %1 == 4 |
| 476 DEFINE_ARGS %2 |
| 477 %elif %1 > 4 |
| 478 DEFINE_ARGS %2, %3 |
| 479 %endif |
| 480 %endmacro |
| 419 | 481 |
| 420 %if WIN64 ; Windows x64 ;================================================= | 482 %if WIN64 ; Windows x64 ;================================================= |
| 421 | 483 |
| 422 DECLARE_REG 0, rcx, ecx, cx, cl | 484 DECLARE_REG 0, rcx |
| 423 DECLARE_REG 1, rdx, edx, dx, dl | 485 DECLARE_REG 1, rdx |
| 424 DECLARE_REG 2, R8, R8D, R8W, R8B | 486 DECLARE_REG 2, R8 |
| 425 DECLARE_REG 3, R9, R9D, R9W, R9B | 487 DECLARE_REG 3, R9 |
| 426 DECLARE_REG 4, R10, R10D, R10W, R10B, 40 | 488 DECLARE_REG 4, R10, 40 |
| 427 DECLARE_REG 5, R11, R11D, R11W, R11B, 48 | 489 DECLARE_REG 5, R11, 48 |
| 428 DECLARE_REG 6, rax, eax, ax, al, 56 | 490 DECLARE_REG 6, rax, 56 |
| 429 DECLARE_REG 7, rdi, edi, di, dil, 64 | 491 DECLARE_REG 7, rdi, 64 |
| 430 DECLARE_REG 8, rsi, esi, si, sil, 72 | 492 DECLARE_REG 8, rsi, 72 |
| 431 DECLARE_REG 9, rbx, ebx, bx, bl, 80 | 493 DECLARE_REG 9, rbx, 80 |
| 432 DECLARE_REG 10, rbp, ebp, bp, bpl, 88 | 494 DECLARE_REG 10, rbp, 88 |
| 433 DECLARE_REG 11, R12, R12D, R12W, R12B, 96 | 495 DECLARE_REG 11, R12, 96 |
| 434 DECLARE_REG 12, R13, R13D, R13W, R13B, 104 | 496 DECLARE_REG 12, R13, 104 |
| 435 DECLARE_REG 13, R14, R14D, R14W, R14B, 112 | 497 DECLARE_REG 13, R14, 112 |
| 436 DECLARE_REG 14, R15, R15D, R15W, R15B, 120 | 498 DECLARE_REG 14, R15, 120 |
| 437 | 499 |
| 438 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... | 500 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
| 439 %assign num_args %1 | 501 %assign num_args %1 |
| 440 %assign regs_used %2 | 502 %assign regs_used %2 |
| 441 ASSERT regs_used >= num_args | 503 ASSERT regs_used >= num_args |
| 504 SETUP_STACK_POINTER %4 |
| 442 ASSERT regs_used <= 15 | 505 ASSERT regs_used <= 15 |
| 443 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 | 506 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 |
| 444 %if mmsize == 8 | 507 ALLOC_STACK %4, %3 |
| 445 %assign xmm_regs_used 0 | 508 %if mmsize != 8 && stack_size == 0 |
| 446 %else | |
| 447 WIN64_SPILL_XMM %3 | 509 WIN64_SPILL_XMM %3 |
| 448 %endif | 510 %endif |
| 449 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 | 511 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 |
| 450 DEFINE_ARGS %4 | 512 DEFINE_ARGS_INTERNAL %0, %4, %5 |
| 513 %endmacro |
| 514 |
| 515 %macro WIN64_PUSH_XMM 0 |
| 516 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. |
| 517 %if xmm_regs_used > 6 |
| 518 movaps [rstk + stack_offset + 8], xmm6 |
| 519 %endif |
| 520 %if xmm_regs_used > 7 |
| 521 movaps [rstk + stack_offset + 24], xmm7 |
| 522 %endif |
| 523 %if xmm_regs_used > 8 |
| 524 %assign %%i 8 |
| 525 %rep xmm_regs_used-8 |
| 526 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i |
| 527 %assign %%i %%i+1 |
| 528 %endrep |
| 529 %endif |
| 451 %endmacro | 530 %endmacro |
| 452 | 531 |
| 453 %macro WIN64_SPILL_XMM 1 | 532 %macro WIN64_SPILL_XMM 1 |
| 454 %assign xmm_regs_used %1 | 533 %assign xmm_regs_used %1 |
| 455 ASSERT xmm_regs_used <= 16 | 534 ASSERT xmm_regs_used <= 16 |
| 535 %if xmm_regs_used > 8 |
| 536 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. |
| 537 %assign %%pad (xmm_regs_used-8)*16 + 32 |
| 538 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) |
| 539 SUB rsp, stack_size_padded |
| 540 %endif |
| 541 WIN64_PUSH_XMM |
| 542 %endmacro |
| 543 |
| 544 %macro WIN64_RESTORE_XMM_INTERNAL 1 |
| 545 %assign %%pad_size 0 |
| 546 %if xmm_regs_used > 8 |
| 547 %assign %%i xmm_regs_used |
| 548 %rep xmm_regs_used-8 |
| 549 %assign %%i %%i-1 |
| 550 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] |
| 551 %endrep |
| 552 %endif |
| 553 %if stack_size_padded > 0 |
| 554 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT |
| 555 mov rsp, rstkm |
| 556 %else |
| 557 add %1, stack_size_padded |
| 558 %assign %%pad_size stack_size_padded |
| 559 %endif |
| 560 %endif |
| 561 %if xmm_regs_used > 7 |
| 562 movaps xmm7, [%1 + stack_offset - %%pad_size + 24] |
| 563 %endif |
| 456 %if xmm_regs_used > 6 | 564 %if xmm_regs_used > 6 |
| 457 SUB rsp, (xmm_regs_used-6)*16+16 | 565 movaps xmm6, [%1 + stack_offset - %%pad_size + 8] |
| 458 %assign %%i xmm_regs_used | |
| 459 %rep (xmm_regs_used-6) | |
| 460 %assign %%i %%i-1 | |
| 461 movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i | |
| 462 %endrep | |
| 463 %endif | |
| 464 %endmacro | |
| 465 | |
| 466 %macro WIN64_RESTORE_XMM_INTERNAL 1 | |
| 467 %if xmm_regs_used > 6 | |
| 468 %assign %%i xmm_regs_used | |
| 469 %rep (xmm_regs_used-6) | |
| 470 %assign %%i %%i-1 | |
| 471 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] | |
| 472 %endrep | |
| 473 add %1, (xmm_regs_used-6)*16+16 | |
| 474 %endif | 566 %endif |
| 475 %endmacro | 567 %endmacro |
| 476 | 568 |
| 477 %macro WIN64_RESTORE_XMM 1 | 569 %macro WIN64_RESTORE_XMM 1 |
| 478 WIN64_RESTORE_XMM_INTERNAL %1 | 570 WIN64_RESTORE_XMM_INTERNAL %1 |
| 479 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 | 571 %assign stack_offset (stack_offset-stack_size_padded) |
| 480 %assign xmm_regs_used 0 | 572 %assign xmm_regs_used 0 |
| 481 %endmacro | 573 %endmacro |
| 482 | 574 |
| 575 %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 |
| 576 |
| 483 %macro RET 0 | 577 %macro RET 0 |
| 484 WIN64_RESTORE_XMM_INTERNAL rsp | 578 WIN64_RESTORE_XMM_INTERNAL rsp |
| 485 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 | 579 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 |
| 486 ret | 580 %if mmsize == 32 |
| 487 %endmacro | 581 vzeroupper |
| 488 | 582 %endif |
| 489 %macro REP_RET 0 | 583 AUTO_REP_RET |
| 490 %if regs_used > 7 || xmm_regs_used > 6 | |
| 491 RET | |
| 492 %else | |
| 493 rep ret | |
| 494 %endif | |
| 495 %endmacro | 584 %endmacro |
| 496 | 585 |
| 497 %elif ARCH_X86_64 ; *nix x64 ;============================================= | 586 %elif ARCH_X86_64 ; *nix x64 ;============================================= |
| 498 | 587 |
| 499 DECLARE_REG 0, rdi, edi, di, dil | 588 DECLARE_REG 0, rdi |
| 500 DECLARE_REG 1, rsi, esi, si, sil | 589 DECLARE_REG 1, rsi |
| 501 DECLARE_REG 2, rdx, edx, dx, dl | 590 DECLARE_REG 2, rdx |
| 502 DECLARE_REG 3, rcx, ecx, cx, cl | 591 DECLARE_REG 3, rcx |
| 503 DECLARE_REG 4, R8, R8D, R8W, R8B | 592 DECLARE_REG 4, R8 |
| 504 DECLARE_REG 5, R9, R9D, R9W, R9B | 593 DECLARE_REG 5, R9 |
| 505 DECLARE_REG 6, rax, eax, ax, al, 8 | 594 DECLARE_REG 6, rax, 8 |
| 506 DECLARE_REG 7, R10, R10D, R10W, R10B, 16 | 595 DECLARE_REG 7, R10, 16 |
| 507 DECLARE_REG 8, R11, R11D, R11W, R11B, 24 | 596 DECLARE_REG 8, R11, 24 |
| 508 DECLARE_REG 9, rbx, ebx, bx, bl, 32 | 597 DECLARE_REG 9, rbx, 32 |
| 509 DECLARE_REG 10, rbp, ebp, bp, bpl, 40 | 598 DECLARE_REG 10, rbp, 40 |
| 510 DECLARE_REG 11, R12, R12D, R12W, R12B, 48 | 599 DECLARE_REG 11, R12, 48 |
| 511 DECLARE_REG 12, R13, R13D, R13W, R13B, 56 | 600 DECLARE_REG 12, R13, 56 |
| 512 DECLARE_REG 13, R14, R14D, R14W, R14B, 64 | 601 DECLARE_REG 13, R14, 64 |
| 513 DECLARE_REG 14, R15, R15D, R15W, R15B, 72 | 602 DECLARE_REG 14, R15, 72 |
| 514 | 603 |
| 515 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | 604 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
| 516 %assign num_args %1 | 605 %assign num_args %1 |
| 517 %assign regs_used %2 | 606 %assign regs_used %2 |
| 518 ASSERT regs_used >= num_args | 607 ASSERT regs_used >= num_args |
| 608 SETUP_STACK_POINTER %4 |
| 519 ASSERT regs_used <= 15 | 609 ASSERT regs_used <= 15 |
| 520 PUSH_IF_USED 9, 10, 11, 12, 13, 14 | 610 PUSH_IF_USED 9, 10, 11, 12, 13, 14 |
| 611 ALLOC_STACK %4 |
| 521 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 | 612 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 |
| 522 DEFINE_ARGS %4 | 613 DEFINE_ARGS_INTERNAL %0, %4, %5 |
| 523 %endmacro | 614 %endmacro |
| 615 |
| 616 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 |
| 524 | 617 |
| 525 %macro RET 0 | 618 %macro RET 0 |
| 619 %if stack_size_padded > 0 |
| 620 %if required_stack_alignment > STACK_ALIGNMENT |
| 621 mov rsp, rstkm |
| 622 %else |
| 623 add rsp, stack_size_padded |
| 624 %endif |
| 625 %endif |
| 526 POP_IF_USED 14, 13, 12, 11, 10, 9 | 626 POP_IF_USED 14, 13, 12, 11, 10, 9 |
| 527 ret | 627 %if mmsize == 32 |
| 528 %endmacro | 628 vzeroupper |
| 529 | 629 %endif |
| 530 %macro REP_RET 0 | 630 AUTO_REP_RET |
| 531 %if regs_used > 9 | |
| 532 RET | |
| 533 %else | |
| 534 rep ret | |
| 535 %endif | |
| 536 %endmacro | 631 %endmacro |
| 537 | 632 |
| 538 %else ; X86_32 ;============================================================== | 633 %else ; X86_32 ;============================================================== |
| 539 | 634 |
| 540 DECLARE_REG 0, eax, eax, ax, al, 4 | 635 DECLARE_REG 0, eax, 4 |
| 541 DECLARE_REG 1, ecx, ecx, cx, cl, 8 | 636 DECLARE_REG 1, ecx, 8 |
| 542 DECLARE_REG 2, edx, edx, dx, dl, 12 | 637 DECLARE_REG 2, edx, 12 |
| 543 DECLARE_REG 3, ebx, ebx, bx, bl, 16 | 638 DECLARE_REG 3, ebx, 16 |
| 544 DECLARE_REG 4, esi, esi, si, null, 20 | 639 DECLARE_REG 4, esi, 20 |
| 545 DECLARE_REG 5, edi, edi, di, null, 24 | 640 DECLARE_REG 5, edi, 24 |
| 546 DECLARE_REG 6, ebp, ebp, bp, null, 28 | 641 DECLARE_REG 6, ebp, 28 |
| 547 %define rsp esp | 642 %define rsp esp |
| 548 | 643 |
| 549 %macro DECLARE_ARG 1-* | 644 %macro DECLARE_ARG 1-* |
| 550 %rep %0 | 645 %rep %0 |
| 551 %define r%1m [esp + stack_offset + 4*%1 + 4] | 646 %define r%1m [rstk + stack_offset + 4*%1 + 4] |
| 552 %define r%1mp dword r%1m | 647 %define r%1mp dword r%1m |
| 553 %rotate 1 | 648 %rotate 1 |
| 554 %endrep | 649 %endrep |
| 555 %endmacro | 650 %endmacro |
| 556 | 651 |
| 557 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | 652 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 |
| 558 | 653 |
| 559 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | 654 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... |
| 560 %assign num_args %1 | 655 %assign num_args %1 |
| 561 %assign regs_used %2 | 656 %assign regs_used %2 |
| 657 ASSERT regs_used >= num_args |
| 658 %if num_args > 7 |
| 659 %assign num_args 7 |
| 660 %endif |
| 562 %if regs_used > 7 | 661 %if regs_used > 7 |
| 563 %assign regs_used 7 | 662 %assign regs_used 7 |
| 564 %endif | 663 %endif |
| 565 ASSERT regs_used >= num_args | 664 SETUP_STACK_POINTER %4 |
| 665 ASSERT regs_used <= 7 |
| 566 PUSH_IF_USED 3, 4, 5, 6 | 666 PUSH_IF_USED 3, 4, 5, 6 |
| 667 ALLOC_STACK %4 |
| 567 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 | 668 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 |
| 568 DEFINE_ARGS %4 | 669 DEFINE_ARGS_INTERNAL %0, %4, %5 |
| 569 %endmacro | 670 %endmacro |
| 570 | 671 |
| 672 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 |
| 673 |
| 571 %macro RET 0 | 674 %macro RET 0 |
| 675 %if stack_size_padded > 0 |
| 676 %if required_stack_alignment > STACK_ALIGNMENT |
| 677 mov rsp, rstkm |
| 678 %else |
| 679 add rsp, stack_size_padded |
| 680 %endif |
| 681 %endif |
| 572 POP_IF_USED 6, 5, 4, 3 | 682 POP_IF_USED 6, 5, 4, 3 |
| 573 ret | 683 %if mmsize == 32 |
| 574 %endmacro | 684 vzeroupper |
| 575 | 685 %endif |
| 576 %macro REP_RET 0 | 686 AUTO_REP_RET |
| 577 %if regs_used > 3 | |
| 578 RET | |
| 579 %else | |
| 580 rep ret | |
| 581 %endif | |
| 582 %endmacro | 687 %endmacro |
| 583 | 688 |
| 584 %endif ;====================================================================== | 689 %endif ;====================================================================== |
| 585 | 690 |
| 586 %if WIN64 == 0 | 691 %if WIN64 == 0 |
| 587 %macro WIN64_SPILL_XMM 1 | 692 %macro WIN64_SPILL_XMM 1 |
| 588 %endmacro | 693 %endmacro |
| 589 %macro WIN64_RESTORE_XMM 1 | 694 %macro WIN64_RESTORE_XMM 1 |
| 590 %endmacro | 695 %endmacro |
| 696 %macro WIN64_PUSH_XMM 0 |
| 697 %endmacro |
| 591 %endif | 698 %endif |
| 592 | 699 |
| 700 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either |
| 701 ; a branch or a branch target. So switch to a 2-byte form of ret in that case. |
| 702 ; We can automatically detect "follows a branch", but not a branch target. |
| 703 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) |
| 704 %macro REP_RET 0 |
| 705 %if has_epilogue |
| 706 RET |
| 707 %else |
| 708 rep ret |
| 709 %endif |
| 710 %endmacro |
| 711 |
| 712 %define last_branch_adr $$ |
| 713 %macro AUTO_REP_RET 0 |
| 714 %ifndef cpuflags |
| 715 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. |
| 716 %elif notcpuflag(ssse3) |
| 717 times ((last_branch_adr-$)>>31)+1 rep |
| 718 %endif |
| 719 ret |
| 720 %endmacro |
| 721 |
| 722 %macro BRANCH_INSTR 0-* |
| 723 %rep %0 |
| 724 %macro %1 1-2 %1 |
| 725 %2 %1 |
| 726 %%branch_instr: |
| 727 %xdefine last_branch_adr %%branch_instr |
| 728 %endmacro |
| 729 %rotate 1 |
| 730 %endrep |
| 731 %endmacro |
| 732 |
| 733 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp |
| 734 |
| 735 %macro TAIL_CALL 2 ; callee, is_nonadjacent |
| 736 %if has_epilogue |
| 737 call %1 |
| 738 RET |
| 739 %elif %2 |
| 740 jmp %1 |
| 741 %endif |
| 742 %endmacro |
| 743 |
| 593 ;============================================================================= | 744 ;============================================================================= |
| 594 ; arch-independent part | 745 ; arch-independent part |
| 595 ;============================================================================= | 746 ;============================================================================= |
| 596 | 747 |
| 597 %assign function_align 16 | 748 %assign function_align 16 |
| 598 | 749 |
| 599 ; Begin a function. | 750 ; Begin a function. |
| 600 ; Applies any symbol mangling needed for C linkage, and sets up a define such that | 751 ; Applies any symbol mangling needed for C linkage, and sets up a define such that |
| 601 ; subsequent uses of the function name automatically refer to the mangled version. | 752 ; subsequent uses of the function name automatically refer to the mangled version. |
| 602 ; Appends cpuflags to the function name if cpuflags has been specified. | 753 ; Appends cpuflags to the function name if cpuflags has been specified. |
| 603 %macro cglobal 1-2+ ; name, [PROLOGUE args] | 754 ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX |
| 604 %if %0 == 1 | 755 ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). |
| 605 cglobal_internal %1 %+ SUFFIX | 756 %macro cglobal 1-2+ "" ; name, [PROLOGUE args] |
| 606 %else | 757 cglobal_internal 1, %1 %+ SUFFIX, %2 |
| 607 cglobal_internal %1 %+ SUFFIX, %2 | |
| 608 %endif | |
| 609 %endmacro | 758 %endmacro |
| 610 %macro cglobal_internal 1-2+ | 759 %macro cvisible 1-2+ "" ; name, [PROLOGUE args] |
| 611 %ifndef cglobaled_%1 | 760 cglobal_internal 0, %1 %+ SUFFIX, %2 |
| 612 %xdefine %1 mangle(program_name %+ _ %+ %1) | 761 %endmacro |
| 613 %xdefine %1.skip_prologue %1 %+ .skip_prologue | 762 %macro cglobal_internal 2-3+ |
| 614 CAT_XDEFINE cglobaled_, %1, 1 | 763 %if %1 |
| 615 %endif | 764 %xdefine %%FUNCTION_PREFIX private_prefix |
| 616 %xdefine current_function %1 | 765 ; libvpx explicitly sets visibility in shared object builds. Avoid |
| 617 %ifdef CHROMIUM | 766 ; setting visibility to hidden as it may break builds that split |
| 618 %ifidn __OUTPUT_FORMAT__,elf | 767 ; sources on e.g., directory boundaries. |
| 619 global %1:function hidden | 768 %ifdef CHROMIUM |
| 620 %elifidn __OUTPUT_FORMAT__,elf32 | 769 %xdefine %%VISIBILITY hidden |
| 621 global %1:function hidden | |
| 622 %elifidn __OUTPUT_FORMAT__,elf64 | |
| 623 global %1:function hidden | |
| 624 %elifidn __OUTPUT_FORMAT__,macho32 | |
| 625 %ifdef __NASM_VER__ | |
| 626 global %1 | |
| 627 %else | |
| 628 global %1:private_extern | |
| 629 %endif | |
| 630 %elifidn __OUTPUT_FORMAT__,macho64 | |
| 631 %ifdef __NASM_VER__ | |
| 632 global %1 | |
| 633 %else | |
| 634 global %1:private_extern | |
| 635 %endif | |
| 636 %else | 770 %else |
| 637 global %1 | 771 %xdefine %%VISIBILITY |
| 638 %endif | 772 %endif |
| 639 %else | 773 %else |
| 640 global %1 | 774 %xdefine %%FUNCTION_PREFIX public_prefix |
| 775 %xdefine %%VISIBILITY |
| 776 %endif |
| 777 %ifndef cglobaled_%2 |
| 778 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) |
| 779 %xdefine %2.skip_prologue %2 %+ .skip_prologue |
| 780 CAT_XDEFINE cglobaled_, %2, 1 |
| 781 %endif |
| 782 %xdefine current_function %2 |
| 783 %ifidn __OUTPUT_FORMAT__,elf32 |
| 784 global %2:function %%VISIBILITY |
| 785 %elifidn __OUTPUT_FORMAT__,elf64 |
| 786 global %2:function %%VISIBILITY |
| 787 %elifidn __OUTPUT_FORMAT__,macho32 |
| 788 %ifdef __NASM_VER__ |
| 789 global %2 |
| 790 %else |
| 791 global %2:private_extern |
| 792 %endif |
| 793 %elifidn __OUTPUT_FORMAT__,macho64 |
| 794 %ifdef __NASM_VER__ |
| 795 global %2 |
| 796 %else |
| 797 global %2:private_extern |
| 798 %endif |
| 799 %else |
| 800 global %2 |
| 641 %endif | 801 %endif |
| 642 align function_align | 802 align function_align |
| 643 %1: | 803 %2: |
| 644 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer | 804 RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer |
| 645 %assign stack_offset 0 | 805 %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required |
| 646 %if %0 > 1 | 806 %assign stack_offset 0 ; stack pointer offset relative to the return address |
| 647 PROLOGUE %2 | 807 %assign stack_size 0 ; amount of stack space that can be freely used inside a function |
| 808 %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding |
| 809 %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 |
| 810 %ifnidn %3, "" |
| 811 PROLOGUE %3 |
| 648 %endif | 812 %endif |
| 649 %endmacro | 813 %endmacro |
| 650 | 814 |
| 651 %macro cextern 1 | 815 %macro cextern 1 |
| 652 %xdefine %1 mangle(program_name %+ _ %+ %1) | 816 %xdefine %1 mangle(private_prefix %+ _ %+ %1) |
| 653 CAT_XDEFINE cglobaled_, %1, 1 | 817 CAT_XDEFINE cglobaled_, %1, 1 |
| 654 extern %1 | 818 extern %1 |
| 655 %endmacro | 819 %endmacro |
| 656 | 820 |
| 657 ; like cextern, but without the prefix | 821 ; like cextern, but without the prefix |
| 658 %macro cextern_naked 1 | 822 %macro cextern_naked 1 |
| 659 %xdefine %1 mangle(%1) | 823 %xdefine %1 mangle(%1) |
| 660 CAT_XDEFINE cglobaled_, %1, 1 | 824 CAT_XDEFINE cglobaled_, %1, 1 |
| 661 extern %1 | 825 extern %1 |
| 662 %endmacro | 826 %endmacro |
| 663 | 827 |
| 664 %macro const 2+ | 828 %macro const 1-2+ |
| 665 %xdefine %1 mangle(program_name %+ _ %+ %1) | 829 %xdefine %1 mangle(private_prefix %+ _ %+ %1) |
| 666 global %1 | 830 %ifidn __OUTPUT_FORMAT__,elf32 |
| 831 global %1:data hidden |
| 832 %elifidn __OUTPUT_FORMAT__,elf64 |
| 833 global %1:data hidden |
| 834 %else |
| 835 global %1 |
| 836 %endif |
| 667 %1: %2 | 837 %1: %2 |
| 668 %endmacro | 838 %endmacro |
| 669 | 839 |
| 670 ; This is needed for ELF, otherwise the GNU linker assumes the stack is | 840 ; This is needed for ELF, otherwise the GNU linker assumes the stack is |
| 671 ; executable by default. | 841 ; executable by default. |
| 672 %ifidn __OUTPUT_FORMAT__,elf | 842 %ifidn __OUTPUT_FORMAT__,elf32 |
| 673 SECTION .note.GNU-stack noalloc noexec nowrite progbits | |
| 674 %elifidn __OUTPUT_FORMAT__,elf32 | |
| 675 SECTION .note.GNU-stack noalloc noexec nowrite progbits | 843 SECTION .note.GNU-stack noalloc noexec nowrite progbits |
| 676 %elifidn __OUTPUT_FORMAT__,elf64 | 844 %elifidn __OUTPUT_FORMAT__,elf64 |
| 677 SECTION .note.GNU-stack noalloc noexec nowrite progbits | 845 SECTION .note.GNU-stack noalloc noexec nowrite progbits |
| 678 %endif | 846 %endif |
| 679 | 847 |
| 680 ; cpuflags | 848 ; cpuflags |
| 681 | 849 |
| 682 %assign cpuflags_mmx (1<<0) | 850 %assign cpuflags_mmx (1<<0) |
| 683 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx | 851 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx |
| 684 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx | 852 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx |
| 685 %assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow | 853 %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow |
| 686 %assign cpuflags_sse (1<<4) | cpuflags_mmx2 | 854 %assign cpuflags_sse (1<<4) | cpuflags_mmx2 |
| 687 %assign cpuflags_sse2 (1<<5) | cpuflags_sse | 855 %assign cpuflags_sse2 (1<<5) | cpuflags_sse |
| 688 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 | 856 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 |
| 689 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 | 857 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 |
| 690 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 | 858 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 |
| 691 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 | 859 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 |
| 692 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 | 860 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 |
| 693 %assign cpuflags_avx (1<<11)| cpuflags_sse42 | 861 %assign cpuflags_avx (1<<11)| cpuflags_sse42 |
| 694 %assign cpuflags_xop (1<<12)| cpuflags_avx | 862 %assign cpuflags_xop (1<<12)| cpuflags_avx |
| 695 %assign cpuflags_fma4 (1<<13)| cpuflags_avx | 863 %assign cpuflags_fma4 (1<<13)| cpuflags_avx |
| 864 %assign cpuflags_fma3 (1<<14)| cpuflags_avx |
| 865 %assign cpuflags_avx2 (1<<15)| cpuflags_fma3 |
| 696 | 866 |
| 697 %assign cpuflags_cache32 (1<<16) | 867 %assign cpuflags_cache32 (1<<16) |
| 698 %assign cpuflags_cache64 (1<<17) | 868 %assign cpuflags_cache64 (1<<17) |
| 699 %assign cpuflags_slowctz (1<<18) | 869 %assign cpuflags_slowctz (1<<18) |
| 700 %assign cpuflags_lzcnt (1<<19) | 870 %assign cpuflags_lzcnt (1<<19) |
| 701 %assign cpuflags_misalign (1<<20) | 871 %assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant |
| 702 %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant | 872 %assign cpuflags_atom (1<<21) |
| 703 %assign cpuflags_atom (1<<22) | 873 %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt |
| 874 %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 |
| 704 | 875 |
| 705 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) | 876 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) |
| 706 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) | 877 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) |
| 707 | 878 |
| 708 ; Takes up to 2 cpuflags from the above list. | 879 ; Takes an arbitrary number of cpuflags from the above list. |
| 709 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. | 880 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. |
| 710 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. | 881 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. |
| 711 %macro INIT_CPUFLAGS 0-2 | 882 %macro INIT_CPUFLAGS 0-* |
| 883 %xdefine SUFFIX |
| 884 %undef cpuname |
| 885 %assign cpuflags 0 |
| 886 |
| 712 %if %0 >= 1 | 887 %if %0 >= 1 |
| 713 %xdefine cpuname %1 | 888 %rep %0 |
| 714 %assign cpuflags cpuflags_%1 | 889 %ifdef cpuname |
| 715 %if %0 >= 2 | 890 %xdefine cpuname cpuname %+ _%1 |
| 716 %xdefine cpuname %1_%2 | 891 %else |
| 717 %assign cpuflags cpuflags | cpuflags_%2 | 892 %xdefine cpuname %1 |
| 718 %endif | 893 %endif |
| 894 %assign cpuflags cpuflags | cpuflags_%1 |
| 895 %rotate 1 |
| 896 %endrep |
| 719 %xdefine SUFFIX _ %+ cpuname | 897 %xdefine SUFFIX _ %+ cpuname |
| 898 |
| 720 %if cpuflag(avx) | 899 %if cpuflag(avx) |
| 721 %assign avx_enabled 1 | 900 %assign avx_enabled 1 |
| 722 %endif | 901 %endif |
| 723 %if mmsize == 16 && notcpuflag(sse2) | 902 %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) |
| 724 %define mova movaps | 903 %define mova movaps |
| 725 %define movu movups | 904 %define movu movups |
| 726 %define movnta movntps | 905 %define movnta movntps |
| 727 %endif | 906 %endif |
| 728 %if cpuflag(aligned) | 907 %if cpuflag(aligned) |
| 729 %define movu mova | 908 %define movu mova |
| 730 %elifidn %1, sse3 | 909 %elif cpuflag(sse3) && notcpuflag(ssse3) |
| 731 %define movu lddqu | 910 %define movu lddqu |
| 732 %endif | 911 %endif |
| 912 %endif |
| 913 |
| 914 %ifdef __NASM_VER__ |
| 915 %use smartalign |
| 916 ALIGNMODE k7 |
| 917 %elif ARCH_X86_64 || cpuflag(sse2) |
| 918 CPU amdnop |
| 733 %else | 919 %else |
| 734 %xdefine SUFFIX | 920 CPU basicnop |
| 735 %undef cpuname | |
| 736 %undef cpuflags | |
| 737 %endif | 921 %endif |
| 738 %endmacro | 922 %endmacro |
| 739 | 923 |
| 740 ; merge mmx and sse* | 924 ; Merge mmx and sse* |
| 925 ; m# is a simd register of the currently selected size |
| 926 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m
# |
| 927 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m
# |
| 928 ; (All 3 remain in sync through SWAP.) |
| 741 | 929 |
| 742 %macro CAT_XDEFINE 3 | 930 %macro CAT_XDEFINE 3 |
| 743 %xdefine %1%2 %3 | 931 %xdefine %1%2 %3 |
| 744 %endmacro | 932 %endmacro |
| 745 | 933 |
| 746 %macro CAT_UNDEF 2 | 934 %macro CAT_UNDEF 2 |
| 747 %undef %1%2 | 935 %undef %1%2 |
| 748 %endmacro | 936 %endmacro |
| 749 | 937 |
| 750 %macro INIT_MMX 0-1+ | 938 %macro INIT_MMX 0-1+ |
| 751 %assign avx_enabled 0 | 939 %assign avx_enabled 0 |
| 752 %define RESET_MM_PERMUTATION INIT_MMX %1 | 940 %define RESET_MM_PERMUTATION INIT_MMX %1 |
| 753 %define mmsize 8 | 941 %define mmsize 8 |
| 754 %define num_mmregs 8 | 942 %define num_mmregs 8 |
| 755 %define mova movq | 943 %define mova movq |
| 756 %define movu movq | 944 %define movu movq |
| 757 %define movh movd | 945 %define movh movd |
| 758 %define movnta movntq | 946 %define movnta movntq |
| 759 %assign %%i 0 | 947 %assign %%i 0 |
| 760 %rep 8 | 948 %rep 8 |
| 761 CAT_XDEFINE m, %%i, mm %+ %%i | 949 CAT_XDEFINE m, %%i, mm %+ %%i |
| 762 CAT_XDEFINE nmm, %%i, %%i | 950 CAT_XDEFINE nnmm, %%i, %%i |
| 763 %assign %%i %%i+1 | 951 %assign %%i %%i+1 |
| 764 %endrep | 952 %endrep |
| 765 %rep 8 | 953 %rep 8 |
| 766 CAT_UNDEF m, %%i | 954 CAT_UNDEF m, %%i |
| 767 CAT_UNDEF nmm, %%i | 955 CAT_UNDEF nnmm, %%i |
| 768 %assign %%i %%i+1 | 956 %assign %%i %%i+1 |
| 769 %endrep | 957 %endrep |
| 770 INIT_CPUFLAGS %1 | 958 INIT_CPUFLAGS %1 |
| 771 %endmacro | 959 %endmacro |
| 772 | 960 |
| 773 %macro INIT_XMM 0-1+ | 961 %macro INIT_XMM 0-1+ |
| 774 %assign avx_enabled 0 | 962 %assign avx_enabled 0 |
| 775 %define RESET_MM_PERMUTATION INIT_XMM %1 | 963 %define RESET_MM_PERMUTATION INIT_XMM %1 |
| 776 %define mmsize 16 | 964 %define mmsize 16 |
| 777 %define num_mmregs 8 | 965 %define num_mmregs 8 |
| 778 %if ARCH_X86_64 | 966 %if ARCH_X86_64 |
| 779 %define num_mmregs 16 | 967 %define num_mmregs 16 |
| 780 %endif | 968 %endif |
| 781 %define mova movdqa | 969 %define mova movdqa |
| 782 %define movu movdqu | 970 %define movu movdqu |
| 783 %define movh movq | 971 %define movh movq |
| 784 %define movnta movntdq | 972 %define movnta movntdq |
| 785 %assign %%i 0 | 973 %assign %%i 0 |
| 786 %rep num_mmregs | 974 %rep num_mmregs |
| 787 CAT_XDEFINE m, %%i, xmm %+ %%i | 975 CAT_XDEFINE m, %%i, xmm %+ %%i |
| 788 CAT_XDEFINE nxmm, %%i, %%i | 976 CAT_XDEFINE nnxmm, %%i, %%i |
| 789 %assign %%i %%i+1 | 977 %assign %%i %%i+1 |
| 790 %endrep | 978 %endrep |
| 791 INIT_CPUFLAGS %1 | 979 INIT_CPUFLAGS %1 |
| 792 %endmacro | 980 %endmacro |
| 793 | 981 |
| 794 ; FIXME: INIT_AVX can be replaced by INIT_XMM avx | |
| 795 %macro INIT_AVX 0 | |
| 796 INIT_XMM | |
| 797 %assign avx_enabled 1 | |
| 798 %define PALIGNR PALIGNR_SSSE3 | |
| 799 %define RESET_MM_PERMUTATION INIT_AVX | |
| 800 %endmacro | |
| 801 | |
| 802 %macro INIT_YMM 0-1+ | 982 %macro INIT_YMM 0-1+ |
| 803 %assign avx_enabled 1 | 983 %assign avx_enabled 1 |
| 804 %define RESET_MM_PERMUTATION INIT_YMM %1 | 984 %define RESET_MM_PERMUTATION INIT_YMM %1 |
| 805 %define mmsize 32 | 985 %define mmsize 32 |
| 806 %define num_mmregs 8 | 986 %define num_mmregs 8 |
| 807 %if ARCH_X86_64 | 987 %if ARCH_X86_64 |
| 808 %define num_mmregs 16 | 988 %define num_mmregs 16 |
| 809 %endif | 989 %endif |
| 810 %define mova vmovaps | 990 %define mova movdqa |
| 811 %define movu vmovups | 991 %define movu movdqu |
| 812 %undef movh | 992 %undef movh |
| 813 %define movnta vmovntps | 993 %define movnta movntdq |
| 814 %assign %%i 0 | 994 %assign %%i 0 |
| 815 %rep num_mmregs | 995 %rep num_mmregs |
| 816 CAT_XDEFINE m, %%i, ymm %+ %%i | 996 CAT_XDEFINE m, %%i, ymm %+ %%i |
| 817 CAT_XDEFINE nymm, %%i, %%i | 997 CAT_XDEFINE nnymm, %%i, %%i |
| 818 %assign %%i %%i+1 | 998 %assign %%i %%i+1 |
| 819 %endrep | 999 %endrep |
| 820 INIT_CPUFLAGS %1 | 1000 INIT_CPUFLAGS %1 |
| 821 %endmacro | 1001 %endmacro |
| 822 | 1002 |
| 823 INIT_XMM | 1003 INIT_XMM |
| 824 | 1004 |
| 1005 %macro DECLARE_MMCAST 1 |
| 1006 %define mmmm%1 mm%1 |
| 1007 %define mmxmm%1 mm%1 |
| 1008 %define mmymm%1 mm%1 |
| 1009 %define xmmmm%1 mm%1 |
| 1010 %define xmmxmm%1 xmm%1 |
| 1011 %define xmmymm%1 xmm%1 |
| 1012 %define ymmmm%1 mm%1 |
| 1013 %define ymmxmm%1 xmm%1 |
| 1014 %define ymmymm%1 ymm%1 |
| 1015 %define xm%1 xmm %+ m%1 |
| 1016 %define ym%1 ymm %+ m%1 |
| 1017 %endmacro |
| 1018 |
| 1019 %assign i 0 |
| 1020 %rep 16 |
| 1021 DECLARE_MMCAST i |
| 1022 %assign i i+1 |
| 1023 %endrep |
| 1024 |
| 825 ; I often want to use macros that permute their arguments. e.g. there's no | 1025 ; I often want to use macros that permute their arguments. e.g. there's no |
| 826 ; efficient way to implement butterfly or transpose or dct without swapping some | 1026 ; efficient way to implement butterfly or transpose or dct without swapping some |
| 827 ; arguments. | 1027 ; arguments. |
| 828 ; | 1028 ; |
| 829 ; I would like to not have to manually keep track of the permutations: | 1029 ; I would like to not have to manually keep track of the permutations: |
| 830 ; If I insert a permutation in the middle of a function, it should automatically | 1030 ; If I insert a permutation in the middle of a function, it should automatically |
| 831 ; change everything that follows. For more complex macros I may also have multiple | 1031 ; change everything that follows. For more complex macros I may also have multiple |
| 832 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. | 1032 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. |
| 833 ; | 1033 ; |
| 834 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that | 1034 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that |
| 835 ; permutes its arguments. It's equivalent to exchanging the contents of the | 1035 ; permutes its arguments. It's equivalent to exchanging the contents of the |
| 836 ; registers, except that this way you exchange the register names instead, so it | 1036 ; registers, except that this way you exchange the register names instead, so it |
| 837 ; doesn't cost any cycles. | 1037 ; doesn't cost any cycles. |
| 838 | 1038 |
| 839 %macro PERMUTE 2-* ; takes a list of pairs to swap | 1039 %macro PERMUTE 2-* ; takes a list of pairs to swap |
| 840 %rep %0/2 | 1040 %rep %0/2 |
| 841 %xdefine tmp%2 m%2 | 1041 %xdefine %%tmp%2 m%2 |
| 842 %xdefine ntmp%2 nm%2 | |
| 843 %rotate 2 | 1042 %rotate 2 |
| 844 %endrep | 1043 %endrep |
| 845 %rep %0/2 | 1044 %rep %0/2 |
| 846 %xdefine m%1 tmp%2 | 1045 %xdefine m%1 %%tmp%2 |
| 847 %xdefine nm%1 ntmp%2 | 1046 CAT_XDEFINE nn, m%1, %1 |
| 848 %undef tmp%2 | |
| 849 %undef ntmp%2 | |
| 850 %rotate 2 | 1047 %rotate 2 |
| 851 %endrep | 1048 %endrep |
| 852 %endmacro | 1049 %endmacro |
| 853 | 1050 |
| 854 %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) | 1051 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) |
| 855 %rep %0-1 | 1052 %ifnum %1 ; SWAP 0, 1, ... |
| 856 %ifdef m%1 | 1053 SWAP_INTERNAL_NUM %1, %2 |
| 857 %xdefine tmp m%1 | 1054 %else ; SWAP m0, m1, ... |
| 858 %xdefine m%1 m%2 | 1055 SWAP_INTERNAL_NAME %1, %2 |
| 859 %xdefine m%2 tmp | |
| 860 CAT_XDEFINE n, m%1, %1 | |
| 861 CAT_XDEFINE n, m%2, %2 | |
| 862 %else | |
| 863 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. | |
| 864 ; Be careful using this mode in nested macros though, as in some cases there may be | |
| 865 ; other copies of m# that have already been dereferenced and don't get updated correctly. | |
| 866 %xdefine %%n1 n %+ %1 | |
| 867 %xdefine %%n2 n %+ %2 | |
| 868 %xdefine tmp m %+ %%n1 | |
| 869 CAT_XDEFINE m, %%n1, m %+ %%n2 | |
| 870 CAT_XDEFINE m, %%n2, tmp | |
| 871 CAT_XDEFINE n, m %+ %%n1, %%n1 | |
| 872 CAT_XDEFINE n, m %+ %%n2, %%n2 | |
| 873 %endif | 1056 %endif |
| 874 %undef tmp | 1057 %endmacro |
| 1058 |
| 1059 %macro SWAP_INTERNAL_NUM 2-* |
| 1060 %rep %0-1 |
| 1061 %xdefine %%tmp m%1 |
| 1062 %xdefine m%1 m%2 |
| 1063 %xdefine m%2 %%tmp |
| 1064 CAT_XDEFINE nn, m%1, %1 |
| 1065 CAT_XDEFINE nn, m%2, %2 |
| 875 %rotate 1 | 1066 %rotate 1 |
| 876 %endrep | 1067 %endrep |
| 1068 %endmacro |
| 1069 |
| 1070 %macro SWAP_INTERNAL_NAME 2-* |
| 1071 %xdefine %%args nn %+ %1 |
| 1072 %rep %0-1 |
| 1073 %xdefine %%args %%args, nn %+ %2 |
| 1074 %rotate 1 |
| 1075 %endrep |
| 1076 SWAP_INTERNAL_NUM %%args |
| 877 %endmacro | 1077 %endmacro |
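A hedged usage sketch (the macro name below is hypothetical): because SWAP only exchanges the m# names, a butterfly can hand back its results under fixed names without spending a move.

    %macro BUTTERFLY_SKETCH 0  ; in: m0 = a, m1 = b; out: m0 = a+b, m1 = a-b (clobbers m2)
        mova   m2, m0
        paddw  m0, m1          ; m0 = a + b
        psubw  m2, m1          ; m2 = a - b
        SWAP   1, 2            ; rename only: the difference is now called m1
    %endmacro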
| 878 | 1078 |
| 879 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later | 1079 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later |
| 880 ; calls to that function will automatically load the permutation, so values can | 1080 ; calls to that function will automatically load the permutation, so values can |
| 881 ; be returned in mmregs. | 1081 ; be returned in mmregs. |
| 882 %macro SAVE_MM_PERMUTATION 0-1 | 1082 %macro SAVE_MM_PERMUTATION 0-1 |
| 883 %if %0 | 1083 %if %0 |
| 884 %xdefine %%f %1_m | 1084 %xdefine %%f %1_m |
| 885 %else | 1085 %else |
| 886 %xdefine %%f current_function %+ _m | 1086 %xdefine %%f current_function %+ _m |
| 887 %endif | 1087 %endif |
| 888 %assign %%i 0 | 1088 %assign %%i 0 |
| 889 %rep num_mmregs | 1089 %rep num_mmregs |
| 890 CAT_XDEFINE %%f, %%i, m %+ %%i | 1090 CAT_XDEFINE %%f, %%i, m %+ %%i |
| 891 %assign %%i %%i+1 | 1091 %assign %%i %%i+1 |
| 892 %endrep | 1092 %endrep |
| 893 %endmacro | 1093 %endmacro |
| 894 | 1094 |
| 895 %macro LOAD_MM_PERMUTATION 1 ; name to load from | 1095 %macro LOAD_MM_PERMUTATION 1 ; name to load from |
| 896 %ifdef %1_m0 | 1096 %ifdef %1_m0 |
| 897 %assign %%i 0 | 1097 %assign %%i 0 |
| 898 %rep num_mmregs | 1098 %rep num_mmregs |
| 899 CAT_XDEFINE m, %%i, %1_m %+ %%i | 1099 CAT_XDEFINE m, %%i, %1_m %+ %%i |
| 900 CAT_XDEFINE n, m %+ %%i, %%i | 1100 CAT_XDEFINE nn, m %+ %%i, %%i |
| 901 %assign %%i %%i+1 | 1101 %assign %%i %%i+1 |
| 902 %endrep | 1102 %endrep |
| 903 %endif | 1103 %endif |
| 904 %endmacro | 1104 %endmacro |
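A hedged sketch of how the two macros pair up (function name hypothetical): the callee records its final m# mapping, and later call sites restore it, so results can be returned in mmregs under stable names.

    cglobal helper_sketch
        SWAP 0, 3              ; result now lives under the name m0
        SAVE_MM_PERMUTATION    ; remember helper_sketch's m# -> register mapping
        RET

    ; ...later, inside some other cglobal function:
        call helper_sketch     ; loads the saved permutation (per the comment above)
        mova [r0], m0          ; m0 names the same register the callee wrote to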
| 905 | 1105 |
| 906 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't | 1106 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't |
| 907 %macro call 1 | 1107 %macro call 1 |
| 908 call_internal %1, %1 %+ SUFFIX | 1108 call_internal %1, %1 %+ SUFFIX |
| 909 %endmacro | 1109 %endmacro |
| 910 %macro call_internal 2 | 1110 %macro call_internal 2 |
| (...skipping 40 matching lines...) |
| 951 %rep 16 | 1151 %rep 16 |
| 952 %if i < 8 | 1152 %if i < 8 |
| 953 CAT_XDEFINE sizeofmm, i, 8 | 1153 CAT_XDEFINE sizeofmm, i, 8 |
| 954 %endif | 1154 %endif |
| 955 CAT_XDEFINE sizeofxmm, i, 16 | 1155 CAT_XDEFINE sizeofxmm, i, 16 |
| 956 CAT_XDEFINE sizeofymm, i, 32 | 1156 CAT_XDEFINE sizeofymm, i, 32 |
| 957 %assign i i+1 | 1157 %assign i i+1 |
| 958 %endrep | 1158 %endrep |
| 959 %undef i | 1159 %undef i |
| 960 | 1160 |
| 1161 %macro CHECK_AVX_INSTR_EMU 3-* |
| 1162 %xdefine %%opcode %1 |
| 1163 %xdefine %%dst %2 |
| 1164 %rep %0-2 |
| 1165 %ifidn %%dst, %3 |
| 1166 %error non-avx emulation of ``%%opcode'' is not supported |
| 1167 %endif |
| 1168 %rotate 1 |
| 1169 %endrep |
| 1170 %endmacro |
| 1171 |
| 961 ;%1 == instruction | 1172 ;%1 == instruction |
| 962 ;%2 == 1 if float, 0 if int | 1173 ;%2 == minimal instruction set |
| 963 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) | 1174 ;%3 == 1 if float, 0 if int |
| 964 ;%4 == number of operands given | 1175 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise |
| 965 ;%5+: operands | 1176 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not |
| 966 %macro RUN_AVX_INSTR 6-7+ | 1177 ;%6+: operands |
| 967 %ifid %5 | 1178 %macro RUN_AVX_INSTR 6-9+ |
| 968 %define %%size sizeof%5 | 1179 %ifnum sizeof%7 |
| 1180 %assign __sizeofreg sizeof%7 |
| 1181 %elifnum sizeof%6 |
| 1182 %assign __sizeofreg sizeof%6 |
| 969 %else | 1183 %else |
| 970 %define %%size mmsize | 1184 %assign __sizeofreg mmsize |
| 971 %endif | 1185 %endif |
| 972 %if %%size==32 | 1186 %assign __emulate_avx 0 |
| 973 %if %0 >= 7 | 1187 %if avx_enabled && __sizeofreg >= 16 |
| 974 v%1 %5, %6, %7 | 1188 %xdefine __instr v%1 |
| 1189 %else |
| 1190 %xdefine __instr %1 |
| 1191 %if %0 >= 8+%4 |
| 1192 %assign __emulate_avx 1 |
| 1193 %endif |
| 1194 %endif |
| 1195 %ifnidn %2, fnord |
| 1196 %ifdef cpuname |
| 1197 %if notcpuflag(%2) |
| 1198 %error use of ``%1'' %2 instruction in cpuname function: current_function |
| 1199 %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 |
| 1200 %error use of ``%1'' sse2 instruction in cpuname function: current_function |
| 1201 %endif |
| 1202 %endif |
| 1203 %endif |
| 1204 |
| 1205 %if __emulate_avx |
| 1206 %xdefine __src1 %7 |
| 1207 %xdefine __src2 %8 |
| 1208 %ifnidn %6, %7 |
| 1209 %if %0 >= 9 |
| 1210 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 |
| 1211 %else |
| 1212 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 |
| 1213 %endif |
| 1214 %if %5 && %4 == 0 |
| 1215 %ifnid %8 |
| 1216 ; 3-operand AVX instructions with a memory arg can only have it in src2, |
| 1217 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). |
| 1218 ; So, if the instruction is commutative with a memory arg, swap them. |
| 1219 %xdefine __src1 %8 |
| 1220 %xdefine __src2 %7 |
| 1221 %endif |
| 1222 %endif |
| 1223 %if __sizeofreg == 8 |
| 1224 MOVQ %6, __src1 |
| 1225 %elif %3 |
| 1226 MOVAPS %6, __src1 |
| 1227 %else |
| 1228 MOVDQA %6, __src1 |
| 1229 %endif |
| 1230 %endif |
| 1231 %if %0 >= 9 |
| 1232 %1 %6, __src2, %9 |
| 975 %else | 1233 %else |
| 976 v%1 %5, %6 | 1234 %1 %6, __src2 |
| 977 %endif | 1235 %endif |
| 1236 %elif %0 >= 9 |
| 1237 __instr %6, %7, %8, %9 |
| 1238 %elif %0 == 8 |
| 1239 __instr %6, %7, %8 |
| 1240 %elif %0 == 7 |
| 1241 __instr %6, %7 |
| 978 %else | 1242 %else |
| 979 %if %%size==8 | 1243 __instr %6 |
| 980 %define %%regmov movq | |
| 981 %elif %2 | |
| 982 %define %%regmov movaps | |
| 983 %else | |
| 984 %define %%regmov movdqa | |
| 985 %endif | |
| 986 | |
| 987 %if %4>=3+%3 | |
| 988 %ifnidn %5, %6 | |
| 989 %if avx_enabled && sizeof%5==16 | |
| 990 v%1 %5, %6, %7 | |
| 991 %else | |
| 992 %%regmov %5, %6 | |
| 993 %1 %5, %7 | |
| 994 %endif | |
| 995 %else | |
| 996 %1 %5, %7 | |
| 997 %endif | |
| 998 %elif %3 | |
| 999 %1 %5, %6, %7 | |
| 1000 %else | |
| 1001 %1 %5, %6 | |
| 1002 %endif | |
| 1003 %endif | 1244 %endif |
| 1004 %endmacro | 1245 %endmacro |
| 1005 | 1246 |
| 1006 ; 3arg AVX ops with a memory arg can only have it in src2, | |
| 1007 ; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). | |
| 1008 ; So, if the op is symmetric and the wrong one is memory, swap them. | |
| 1009 %macro RUN_AVX_INSTR1 8 | |
| 1010 %assign %%swap 0 | |
| 1011 %if avx_enabled | |
| 1012 %ifnid %6 | |
| 1013 %assign %%swap 1 | |
| 1014 %endif | |
| 1015 %elifnidn %5, %6 | |
| 1016 %ifnid %7 | |
| 1017 %assign %%swap 1 | |
| 1018 %endif | |
| 1019 %endif | |
| 1020 %if %%swap && %3 == 0 && %8 == 1 | |
| 1021 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 | |
| 1022 %else | |
| 1023 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 | |
| 1024 %endif | |
| 1025 %endmacro | |
| 1026 | |
| 1027 ;%1 == instruction | 1247 ;%1 == instruction |
| 1028 ;%2 == 1 if float, 0 if int | 1248 ;%2 == minimal instruction set |
| 1029 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) | 1249 ;%3 == 1 if float, 0 if int |
| 1030 ;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not | 1250 ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise |
| 1031 %macro AVX_INSTR 4 | 1251 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not |
| 1032 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 | 1252 %macro AVX_INSTR 1-5 fnord, 0, 1, 0 |
| 1033 %ifidn %3, fnord | 1253 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 |
| 1034 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 | 1254 %ifidn %2, fnord |
| 1255 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 |
| 1256 %elifidn %3, fnord |
| 1257 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 |
| 1035 %elifidn %4, fnord | 1258 %elifidn %4, fnord |
| 1036 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 | 1259 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 |
| 1037 %elifidn %5, fnord | 1260 %elifidn %5, fnord |
| 1038 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 | 1261 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 |
| 1039 %else | 1262 %else |
| 1040 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 | 1263 RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 |
| 1041 %endif | 1264 %endif |
| 1042 %endmacro | 1265 %endmacro |
| 1043 %endmacro | 1266 %endmacro |
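A rough before/after of what this wrapper produces (my reading of RUN_AVX_INSTR, not text from the patch): with AVX enabled the v-prefixed form is emitted directly; otherwise the non-destructive 3-operand syntax is emulated with a register copy plus the legacy 2-operand instruction.

    INIT_XMM sse2
    paddw  m0, m1, m2    ; emulated roughly as: movdqa xmm0, xmm1 / paddw xmm0, xmm2

    INIT_XMM avx
    paddw  m0, m1, m2    ; emitted as: vpaddw xmm0, xmm1, xmm2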
| 1044 | 1267 |
| 1045 AVX_INSTR addpd, 1, 0, 1 | 1268 ; Instructions with both VEX and non-VEX encodings |
| 1046 AVX_INSTR addps, 1, 0, 1 | 1269 ; Non-destructive instructions are written without parameters |
| 1047 AVX_INSTR addsd, 1, 0, 1 | 1270 AVX_INSTR addpd, sse2, 1, 0, 1 |
| 1048 AVX_INSTR addss, 1, 0, 1 | 1271 AVX_INSTR addps, sse, 1, 0, 1 |
| 1049 AVX_INSTR addsubpd, 1, 0, 0 | 1272 AVX_INSTR addsd, sse2, 1, 0, 1 |
| 1050 AVX_INSTR addsubps, 1, 0, 0 | 1273 AVX_INSTR addss, sse, 1, 0, 1 |
| 1051 AVX_INSTR andpd, 1, 0, 1 | 1274 AVX_INSTR addsubpd, sse3, 1, 0, 0 |
| 1052 AVX_INSTR andps, 1, 0, 1 | 1275 AVX_INSTR addsubps, sse3, 1, 0, 0 |
| 1053 AVX_INSTR andnpd, 1, 0, 0 | 1276 AVX_INSTR aesdec, fnord, 0, 0, 0 |
| 1054 AVX_INSTR andnps, 1, 0, 0 | 1277 AVX_INSTR aesdeclast, fnord, 0, 0, 0 |
| 1055 AVX_INSTR blendpd, 1, 0, 0 | 1278 AVX_INSTR aesenc, fnord, 0, 0, 0 |
| 1056 AVX_INSTR blendps, 1, 0, 0 | 1279 AVX_INSTR aesenclast, fnord, 0, 0, 0 |
| 1057 AVX_INSTR blendvpd, 1, 0, 0 | 1280 AVX_INSTR aesimc |
| 1058 AVX_INSTR blendvps, 1, 0, 0 | 1281 AVX_INSTR aeskeygenassist |
| 1059 AVX_INSTR cmppd, 1, 0, 0 | 1282 AVX_INSTR andnpd, sse2, 1, 0, 0 |
| 1060 AVX_INSTR cmpps, 1, 0, 0 | 1283 AVX_INSTR andnps, sse, 1, 0, 0 |
| 1061 AVX_INSTR cmpsd, 1, 0, 0 | 1284 AVX_INSTR andpd, sse2, 1, 0, 1 |
| 1062 AVX_INSTR cmpss, 1, 0, 0 | 1285 AVX_INSTR andps, sse, 1, 0, 1 |
| 1063 AVX_INSTR cvtdq2ps, 1, 0, 0 | 1286 AVX_INSTR blendpd, sse4, 1, 0, 0 |
| 1064 AVX_INSTR cvtps2dq, 1, 0, 0 | 1287 AVX_INSTR blendps, sse4, 1, 0, 0 |
| 1065 AVX_INSTR divpd, 1, 0, 0 | 1288 AVX_INSTR blendvpd, sse4, 1, 0, 0 |
| 1066 AVX_INSTR divps, 1, 0, 0 | 1289 AVX_INSTR blendvps, sse4, 1, 0, 0 |
| 1067 AVX_INSTR divsd, 1, 0, 0 | 1290 AVX_INSTR cmppd, sse2, 1, 1, 0 |
| 1068 AVX_INSTR divss, 1, 0, 0 | 1291 AVX_INSTR cmpps, sse, 1, 1, 0 |
| 1069 AVX_INSTR dppd, 1, 1, 0 | 1292 AVX_INSTR cmpsd, sse2, 1, 1, 0 |
| 1070 AVX_INSTR dpps, 1, 1, 0 | 1293 AVX_INSTR cmpss, sse, 1, 1, 0 |
| 1071 AVX_INSTR haddpd, 1, 0, 0 | 1294 AVX_INSTR comisd, sse2 |
| 1072 AVX_INSTR haddps, 1, 0, 0 | 1295 AVX_INSTR comiss, sse |
| 1073 AVX_INSTR hsubpd, 1, 0, 0 | 1296 AVX_INSTR cvtdq2pd, sse2 |
| 1074 AVX_INSTR hsubps, 1, 0, 0 | 1297 AVX_INSTR cvtdq2ps, sse2 |
| 1075 AVX_INSTR maxpd, 1, 0, 1 | 1298 AVX_INSTR cvtpd2dq, sse2 |
| 1076 AVX_INSTR maxps, 1, 0, 1 | 1299 AVX_INSTR cvtpd2ps, sse2 |
| 1077 AVX_INSTR maxsd, 1, 0, 1 | 1300 AVX_INSTR cvtps2dq, sse2 |
| 1078 AVX_INSTR maxss, 1, 0, 1 | 1301 AVX_INSTR cvtps2pd, sse2 |
| 1079 AVX_INSTR minpd, 1, 0, 1 | 1302 AVX_INSTR cvtsd2si, sse2 |
| 1080 AVX_INSTR minps, 1, 0, 1 | 1303 AVX_INSTR cvtsd2ss, sse2 |
| 1081 AVX_INSTR minsd, 1, 0, 1 | 1304 AVX_INSTR cvtsi2sd, sse2 |
| 1082 AVX_INSTR minss, 1, 0, 1 | 1305 AVX_INSTR cvtsi2ss, sse |
| 1083 AVX_INSTR movhlps, 1, 0, 0 | 1306 AVX_INSTR cvtss2sd, sse2 |
| 1084 AVX_INSTR movlhps, 1, 0, 0 | 1307 AVX_INSTR cvtss2si, sse |
| 1085 AVX_INSTR movsd, 1, 0, 0 | 1308 AVX_INSTR cvttpd2dq, sse2 |
| 1086 AVX_INSTR movss, 1, 0, 0 | 1309 AVX_INSTR cvttps2dq, sse2 |
| 1087 AVX_INSTR mpsadbw, 0, 1, 0 | 1310 AVX_INSTR cvttsd2si, sse2 |
| 1088 AVX_INSTR mulpd, 1, 0, 1 | 1311 AVX_INSTR cvttss2si, sse |
| 1089 AVX_INSTR mulps, 1, 0, 1 | 1312 AVX_INSTR divpd, sse2, 1, 0, 0 |
| 1090 AVX_INSTR mulsd, 1, 0, 1 | 1313 AVX_INSTR divps, sse, 1, 0, 0 |
| 1091 AVX_INSTR mulss, 1, 0, 1 | 1314 AVX_INSTR divsd, sse2, 1, 0, 0 |
| 1092 AVX_INSTR orpd, 1, 0, 1 | 1315 AVX_INSTR divss, sse, 1, 0, 0 |
| 1093 AVX_INSTR orps, 1, 0, 1 | 1316 AVX_INSTR dppd, sse4, 1, 1, 0 |
| 1094 AVX_INSTR packsswb, 0, 0, 0 | 1317 AVX_INSTR dpps, sse4, 1, 1, 0 |
| 1095 AVX_INSTR packssdw, 0, 0, 0 | 1318 AVX_INSTR extractps, sse4 |
| 1096 AVX_INSTR packuswb, 0, 0, 0 | 1319 AVX_INSTR haddpd, sse3, 1, 0, 0 |
| 1097 AVX_INSTR packusdw, 0, 0, 0 | 1320 AVX_INSTR haddps, sse3, 1, 0, 0 |
| 1098 AVX_INSTR paddb, 0, 0, 1 | 1321 AVX_INSTR hsubpd, sse3, 1, 0, 0 |
| 1099 AVX_INSTR paddw, 0, 0, 1 | 1322 AVX_INSTR hsubps, sse3, 1, 0, 0 |
| 1100 AVX_INSTR paddd, 0, 0, 1 | 1323 AVX_INSTR insertps, sse4, 1, 1, 0 |
| 1101 AVX_INSTR paddq, 0, 0, 1 | 1324 AVX_INSTR lddqu, sse3 |
| 1102 AVX_INSTR paddsb, 0, 0, 1 | 1325 AVX_INSTR ldmxcsr, sse |
| 1103 AVX_INSTR paddsw, 0, 0, 1 | 1326 AVX_INSTR maskmovdqu, sse2 |
| 1104 AVX_INSTR paddusb, 0, 0, 1 | 1327 AVX_INSTR maxpd, sse2, 1, 0, 1 |
| 1105 AVX_INSTR paddusw, 0, 0, 1 | 1328 AVX_INSTR maxps, sse, 1, 0, 1 |
| 1106 AVX_INSTR palignr, 0, 1, 0 | 1329 AVX_INSTR maxsd, sse2, 1, 0, 1 |
| 1107 AVX_INSTR pand, 0, 0, 1 | 1330 AVX_INSTR maxss, sse, 1, 0, 1 |
| 1108 AVX_INSTR pandn, 0, 0, 0 | 1331 AVX_INSTR minpd, sse2, 1, 0, 1 |
| 1109 AVX_INSTR pavgb, 0, 0, 1 | 1332 AVX_INSTR minps, sse, 1, 0, 1 |
| 1110 AVX_INSTR pavgw, 0, 0, 1 | 1333 AVX_INSTR minsd, sse2, 1, 0, 1 |
| 1111 AVX_INSTR pblendvb, 0, 0, 0 | 1334 AVX_INSTR minss, sse, 1, 0, 1 |
| 1112 AVX_INSTR pblendw, 0, 1, 0 | 1335 AVX_INSTR movapd, sse2 |
| 1113 AVX_INSTR pcmpestri, 0, 0, 0 | 1336 AVX_INSTR movaps, sse |
| 1114 AVX_INSTR pcmpestrm, 0, 0, 0 | 1337 AVX_INSTR movd, mmx |
| 1115 AVX_INSTR pcmpistri, 0, 0, 0 | 1338 AVX_INSTR movddup, sse3 |
| 1116 AVX_INSTR pcmpistrm, 0, 0, 0 | 1339 AVX_INSTR movdqa, sse2 |
| 1117 AVX_INSTR pcmpeqb, 0, 0, 1 | 1340 AVX_INSTR movdqu, sse2 |
| 1118 AVX_INSTR pcmpeqw, 0, 0, 1 | 1341 AVX_INSTR movhlps, sse, 1, 0, 0 |
| 1119 AVX_INSTR pcmpeqd, 0, 0, 1 | 1342 AVX_INSTR movhpd, sse2, 1, 0, 0 |
| 1120 AVX_INSTR pcmpeqq, 0, 0, 1 | 1343 AVX_INSTR movhps, sse, 1, 0, 0 |
| 1121 AVX_INSTR pcmpgtb, 0, 0, 0 | 1344 AVX_INSTR movlhps, sse, 1, 0, 0 |
| 1122 AVX_INSTR pcmpgtw, 0, 0, 0 | 1345 AVX_INSTR movlpd, sse2, 1, 0, 0 |
| 1123 AVX_INSTR pcmpgtd, 0, 0, 0 | 1346 AVX_INSTR movlps, sse, 1, 0, 0 |
| 1124 AVX_INSTR pcmpgtq, 0, 0, 0 | 1347 AVX_INSTR movmskpd, sse2 |
| 1125 AVX_INSTR phaddw, 0, 0, 0 | 1348 AVX_INSTR movmskps, sse |
| 1126 AVX_INSTR phaddd, 0, 0, 0 | 1349 AVX_INSTR movntdq, sse2 |
| 1127 AVX_INSTR phaddsw, 0, 0, 0 | 1350 AVX_INSTR movntdqa, sse4 |
| 1128 AVX_INSTR phsubw, 0, 0, 0 | 1351 AVX_INSTR movntpd, sse2 |
| 1129 AVX_INSTR phsubd, 0, 0, 0 | 1352 AVX_INSTR movntps, sse |
| 1130 AVX_INSTR phsubsw, 0, 0, 0 | 1353 AVX_INSTR movq, mmx |
| 1131 AVX_INSTR pmaddwd, 0, 0, 1 | 1354 AVX_INSTR movsd, sse2, 1, 0, 0 |
| 1132 AVX_INSTR pmaddubsw, 0, 0, 0 | 1355 AVX_INSTR movshdup, sse3 |
| 1133 AVX_INSTR pmaxsb, 0, 0, 1 | 1356 AVX_INSTR movsldup, sse3 |
| 1134 AVX_INSTR pmaxsw, 0, 0, 1 | 1357 AVX_INSTR movss, sse, 1, 0, 0 |
| 1135 AVX_INSTR pmaxsd, 0, 0, 1 | 1358 AVX_INSTR movupd, sse2 |
| 1136 AVX_INSTR pmaxub, 0, 0, 1 | 1359 AVX_INSTR movups, sse |
| 1137 AVX_INSTR pmaxuw, 0, 0, 1 | 1360 AVX_INSTR mpsadbw, sse4 |
| 1138 AVX_INSTR pmaxud, 0, 0, 1 | 1361 AVX_INSTR mulpd, sse2, 1, 0, 1 |
| 1139 AVX_INSTR pminsb, 0, 0, 1 | 1362 AVX_INSTR mulps, sse, 1, 0, 1 |
| 1140 AVX_INSTR pminsw, 0, 0, 1 | 1363 AVX_INSTR mulsd, sse2, 1, 0, 1 |
| 1141 AVX_INSTR pminsd, 0, 0, 1 | 1364 AVX_INSTR mulss, sse, 1, 0, 1 |
| 1142 AVX_INSTR pminub, 0, 0, 1 | 1365 AVX_INSTR orpd, sse2, 1, 0, 1 |
| 1143 AVX_INSTR pminuw, 0, 0, 1 | 1366 AVX_INSTR orps, sse, 1, 0, 1 |
| 1144 AVX_INSTR pminud, 0, 0, 1 | 1367 AVX_INSTR pabsb, ssse3 |
| 1145 AVX_INSTR pmulhuw, 0, 0, 1 | 1368 AVX_INSTR pabsd, ssse3 |
| 1146 AVX_INSTR pmulhrsw, 0, 0, 1 | 1369 AVX_INSTR pabsw, ssse3 |
| 1147 AVX_INSTR pmulhw, 0, 0, 1 | 1370 AVX_INSTR packsswb, mmx, 0, 0, 0 |
| 1148 AVX_INSTR pmullw, 0, 0, 1 | 1371 AVX_INSTR packssdw, mmx, 0, 0, 0 |
| 1149 AVX_INSTR pmulld, 0, 0, 1 | 1372 AVX_INSTR packuswb, mmx, 0, 0, 0 |
| 1150 AVX_INSTR pmuludq, 0, 0, 1 | 1373 AVX_INSTR packusdw, sse4, 0, 0, 0 |
| 1151 AVX_INSTR pmuldq, 0, 0, 1 | 1374 AVX_INSTR paddb, mmx, 0, 0, 1 |
| 1152 AVX_INSTR por, 0, 0, 1 | 1375 AVX_INSTR paddw, mmx, 0, 0, 1 |
| 1153 AVX_INSTR psadbw, 0, 0, 1 | 1376 AVX_INSTR paddd, mmx, 0, 0, 1 |
| 1154 AVX_INSTR pshufb, 0, 0, 0 | 1377 AVX_INSTR paddq, sse2, 0, 0, 1 |
| 1155 AVX_INSTR psignb, 0, 0, 0 | 1378 AVX_INSTR paddsb, mmx, 0, 0, 1 |
| 1156 AVX_INSTR psignw, 0, 0, 0 | 1379 AVX_INSTR paddsw, mmx, 0, 0, 1 |
| 1157 AVX_INSTR psignd, 0, 0, 0 | 1380 AVX_INSTR paddusb, mmx, 0, 0, 1 |
| 1158 AVX_INSTR psllw, 0, 0, 0 | 1381 AVX_INSTR paddusw, mmx, 0, 0, 1 |
| 1159 AVX_INSTR pslld, 0, 0, 0 | 1382 AVX_INSTR palignr, ssse3 |
| 1160 AVX_INSTR psllq, 0, 0, 0 | 1383 AVX_INSTR pand, mmx, 0, 0, 1 |
| 1161 AVX_INSTR pslldq, 0, 0, 0 | 1384 AVX_INSTR pandn, mmx, 0, 0, 0 |
| 1162 AVX_INSTR psraw, 0, 0, 0 | 1385 AVX_INSTR pavgb, mmx2, 0, 0, 1 |
| 1163 AVX_INSTR psrad, 0, 0, 0 | 1386 AVX_INSTR pavgw, mmx2, 0, 0, 1 |
| 1164 AVX_INSTR psrlw, 0, 0, 0 | 1387 AVX_INSTR pblendvb, sse4, 0, 0, 0 |
| 1165 AVX_INSTR psrld, 0, 0, 0 | 1388 AVX_INSTR pblendw, sse4 |
| 1166 AVX_INSTR psrlq, 0, 0, 0 | 1389 AVX_INSTR pclmulqdq |
| 1167 AVX_INSTR psrldq, 0, 0, 0 | 1390 AVX_INSTR pcmpestri, sse42 |
| 1168 AVX_INSTR psubb, 0, 0, 0 | 1391 AVX_INSTR pcmpestrm, sse42 |
| 1169 AVX_INSTR psubw, 0, 0, 0 | 1392 AVX_INSTR pcmpistri, sse42 |
| 1170 AVX_INSTR psubd, 0, 0, 0 | 1393 AVX_INSTR pcmpistrm, sse42 |
| 1171 AVX_INSTR psubq, 0, 0, 0 | 1394 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 |
| 1172 AVX_INSTR psubsb, 0, 0, 0 | 1395 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 |
| 1173 AVX_INSTR psubsw, 0, 0, 0 | 1396 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 |
| 1174 AVX_INSTR psubusb, 0, 0, 0 | 1397 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 |
| 1175 AVX_INSTR psubusw, 0, 0, 0 | 1398 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 |
| 1176 AVX_INSTR punpckhbw, 0, 0, 0 | 1399 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 |
| 1177 AVX_INSTR punpckhwd, 0, 0, 0 | 1400 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 |
| 1178 AVX_INSTR punpckhdq, 0, 0, 0 | 1401 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 |
| 1179 AVX_INSTR punpckhqdq, 0, 0, 0 | 1402 AVX_INSTR pextrb, sse4 |
| 1180 AVX_INSTR punpcklbw, 0, 0, 0 | 1403 AVX_INSTR pextrd, sse4 |
| 1181 AVX_INSTR punpcklwd, 0, 0, 0 | 1404 AVX_INSTR pextrq, sse4 |
| 1182 AVX_INSTR punpckldq, 0, 0, 0 | 1405 AVX_INSTR pextrw, mmx2 |
| 1183 AVX_INSTR punpcklqdq, 0, 0, 0 | 1406 AVX_INSTR phaddw, ssse3, 0, 0, 0 |
| 1184 AVX_INSTR pxor, 0, 0, 1 | 1407 AVX_INSTR phaddd, ssse3, 0, 0, 0 |
| 1185 AVX_INSTR shufps, 1, 1, 0 | 1408 AVX_INSTR phaddsw, ssse3, 0, 0, 0 |
| 1186 AVX_INSTR subpd, 1, 0, 0 | 1409 AVX_INSTR phminposuw, sse4 |
| 1187 AVX_INSTR subps, 1, 0, 0 | 1410 AVX_INSTR phsubw, ssse3, 0, 0, 0 |
| 1188 AVX_INSTR subsd, 1, 0, 0 | 1411 AVX_INSTR phsubd, ssse3, 0, 0, 0 |
| 1189 AVX_INSTR subss, 1, 0, 0 | 1412 AVX_INSTR phsubsw, ssse3, 0, 0, 0 |
| 1190 AVX_INSTR unpckhpd, 1, 0, 0 | 1413 AVX_INSTR pinsrb, sse4 |
| 1191 AVX_INSTR unpckhps, 1, 0, 0 | 1414 AVX_INSTR pinsrd, sse4 |
| 1192 AVX_INSTR unpcklpd, 1, 0, 0 | 1415 AVX_INSTR pinsrq, sse4 |
| 1193 AVX_INSTR unpcklps, 1, 0, 0 | 1416 AVX_INSTR pinsrw, mmx2 |
| 1194 AVX_INSTR xorpd, 1, 0, 1 | 1417 AVX_INSTR pmaddwd, mmx, 0, 0, 1 |
| 1195 AVX_INSTR xorps, 1, 0, 1 | 1418 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 |
| 1419 AVX_INSTR pmaxsb, sse4, 0, 0, 1 |
| 1420 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 |
| 1421 AVX_INSTR pmaxsd, sse4, 0, 0, 1 |
| 1422 AVX_INSTR pmaxub, mmx2, 0, 0, 1 |
| 1423 AVX_INSTR pmaxuw, sse4, 0, 0, 1 |
| 1424 AVX_INSTR pmaxud, sse4, 0, 0, 1 |
| 1425 AVX_INSTR pminsb, sse4, 0, 0, 1 |
| 1426 AVX_INSTR pminsw, mmx2, 0, 0, 1 |
| 1427 AVX_INSTR pminsd, sse4, 0, 0, 1 |
| 1428 AVX_INSTR pminub, mmx2, 0, 0, 1 |
| 1429 AVX_INSTR pminuw, sse4, 0, 0, 1 |
| 1430 AVX_INSTR pminud, sse4, 0, 0, 1 |
| 1431 AVX_INSTR pmovmskb, mmx2 |
| 1432 AVX_INSTR pmovsxbw, sse4 |
| 1433 AVX_INSTR pmovsxbd, sse4 |
| 1434 AVX_INSTR pmovsxbq, sse4 |
| 1435 AVX_INSTR pmovsxwd, sse4 |
| 1436 AVX_INSTR pmovsxwq, sse4 |
| 1437 AVX_INSTR pmovsxdq, sse4 |
| 1438 AVX_INSTR pmovzxbw, sse4 |
| 1439 AVX_INSTR pmovzxbd, sse4 |
| 1440 AVX_INSTR pmovzxbq, sse4 |
| 1441 AVX_INSTR pmovzxwd, sse4 |
| 1442 AVX_INSTR pmovzxwq, sse4 |
| 1443 AVX_INSTR pmovzxdq, sse4 |
| 1444 AVX_INSTR pmuldq, sse4, 0, 0, 1 |
| 1445 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 |
| 1446 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 |
| 1447 AVX_INSTR pmulhw, mmx, 0, 0, 1 |
| 1448 AVX_INSTR pmullw, mmx, 0, 0, 1 |
| 1449 AVX_INSTR pmulld, sse4, 0, 0, 1 |
| 1450 AVX_INSTR pmuludq, sse2, 0, 0, 1 |
| 1451 AVX_INSTR por, mmx, 0, 0, 1 |
| 1452 AVX_INSTR psadbw, mmx2, 0, 0, 1 |
| 1453 AVX_INSTR pshufb, ssse3, 0, 0, 0 |
| 1454 AVX_INSTR pshufd, sse2 |
| 1455 AVX_INSTR pshufhw, sse2 |
| 1456 AVX_INSTR pshuflw, sse2 |
| 1457 AVX_INSTR psignb, ssse3, 0, 0, 0 |
| 1458 AVX_INSTR psignw, ssse3, 0, 0, 0 |
| 1459 AVX_INSTR psignd, ssse3, 0, 0, 0 |
| 1460 AVX_INSTR psllw, mmx, 0, 0, 0 |
| 1461 AVX_INSTR pslld, mmx, 0, 0, 0 |
| 1462 AVX_INSTR psllq, mmx, 0, 0, 0 |
| 1463 AVX_INSTR pslldq, sse2, 0, 0, 0 |
| 1464 AVX_INSTR psraw, mmx, 0, 0, 0 |
| 1465 AVX_INSTR psrad, mmx, 0, 0, 0 |
| 1466 AVX_INSTR psrlw, mmx, 0, 0, 0 |
| 1467 AVX_INSTR psrld, mmx, 0, 0, 0 |
| 1468 AVX_INSTR psrlq, mmx, 0, 0, 0 |
| 1469 AVX_INSTR psrldq, sse2, 0, 0, 0 |
| 1470 AVX_INSTR psubb, mmx, 0, 0, 0 |
| 1471 AVX_INSTR psubw, mmx, 0, 0, 0 |
| 1472 AVX_INSTR psubd, mmx, 0, 0, 0 |
| 1473 AVX_INSTR psubq, sse2, 0, 0, 0 |
| 1474 AVX_INSTR psubsb, mmx, 0, 0, 0 |
| 1475 AVX_INSTR psubsw, mmx, 0, 0, 0 |
| 1476 AVX_INSTR psubusb, mmx, 0, 0, 0 |
| 1477 AVX_INSTR psubusw, mmx, 0, 0, 0 |
| 1478 AVX_INSTR ptest, sse4 |
| 1479 AVX_INSTR punpckhbw, mmx, 0, 0, 0 |
| 1480 AVX_INSTR punpckhwd, mmx, 0, 0, 0 |
| 1481 AVX_INSTR punpckhdq, mmx, 0, 0, 0 |
| 1482 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 |
| 1483 AVX_INSTR punpcklbw, mmx, 0, 0, 0 |
| 1484 AVX_INSTR punpcklwd, mmx, 0, 0, 0 |
| 1485 AVX_INSTR punpckldq, mmx, 0, 0, 0 |
| 1486 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 |
| 1487 AVX_INSTR pxor, mmx, 0, 0, 1 |
| 1488 AVX_INSTR rcpps, sse, 1, 0, 0 |
| 1489 AVX_INSTR rcpss, sse, 1, 0, 0 |
| 1490 AVX_INSTR roundpd, sse4 |
| 1491 AVX_INSTR roundps, sse4 |
| 1492 AVX_INSTR roundsd, sse4 |
| 1493 AVX_INSTR roundss, sse4 |
| 1494 AVX_INSTR rsqrtps, sse, 1, 0, 0 |
| 1495 AVX_INSTR rsqrtss, sse, 1, 0, 0 |
| 1496 AVX_INSTR shufpd, sse2, 1, 1, 0 |
| 1497 AVX_INSTR shufps, sse, 1, 1, 0 |
| 1498 AVX_INSTR sqrtpd, sse2, 1, 0, 0 |
| 1499 AVX_INSTR sqrtps, sse, 1, 0, 0 |
| 1500 AVX_INSTR sqrtsd, sse2, 1, 0, 0 |
| 1501 AVX_INSTR sqrtss, sse, 1, 0, 0 |
| 1502 AVX_INSTR stmxcsr, sse |
| 1503 AVX_INSTR subpd, sse2, 1, 0, 0 |
| 1504 AVX_INSTR subps, sse, 1, 0, 0 |
| 1505 AVX_INSTR subsd, sse2, 1, 0, 0 |
| 1506 AVX_INSTR subss, sse, 1, 0, 0 |
| 1507 AVX_INSTR ucomisd, sse2 |
| 1508 AVX_INSTR ucomiss, sse |
| 1509 AVX_INSTR unpckhpd, sse2, 1, 0, 0 |
| 1510 AVX_INSTR unpckhps, sse, 1, 0, 0 |
| 1511 AVX_INSTR unpcklpd, sse2, 1, 0, 0 |
| 1512 AVX_INSTR unpcklps, sse, 1, 0, 0 |
| 1513 AVX_INSTR xorpd, sse2, 1, 0, 1 |
| 1514 AVX_INSTR xorps, sse, 1, 0, 1 |
| 1196 | 1515 |
| 1197 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN | 1516 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN |
| 1198 AVX_INSTR pfadd, 1, 0, 1 | 1517 AVX_INSTR pfadd, 3dnow, 1, 0, 1 |
| 1199 AVX_INSTR pfsub, 1, 0, 0 | 1518 AVX_INSTR pfsub, 3dnow, 1, 0, 0 |
| 1200 AVX_INSTR pfmul, 1, 0, 1 | 1519 AVX_INSTR pfmul, 3dnow, 1, 0, 1 |
| 1201 | 1520 |
| 1202 ; base-4 constants for shuffles | 1521 ; base-4 constants for shuffles |
| 1203 %assign i 0 | 1522 %assign i 0 |
| 1204 %rep 256 | 1523 %rep 256 |
| 1205 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) | 1524 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) |
| 1206 %if j < 10 | 1525 %if j < 10 |
| 1207 CAT_XDEFINE q000, j, i | 1526 CAT_XDEFINE q000, j, i |
| 1208 %elif j < 100 | 1527 %elif j < 100 |
| 1209 CAT_XDEFINE q00, j, i | 1528 CAT_XDEFINE q00, j, i |
| 1210 %elif j < 1000 | 1529 %elif j < 1000 |
| 1211 CAT_XDEFINE q0, j, i | 1530 CAT_XDEFINE q0, j, i |
| 1212 %else | 1531 %else |
| 1213 CAT_XDEFINE q, j, i | 1532 CAT_XDEFINE q, j, i |
| 1214 %endif | 1533 %endif |
| 1215 %assign i i+1 | 1534 %assign i i+1 |
| 1216 %endrep | 1535 %endrep |
| 1217 %undef i | 1536 %undef i |
| 1218 %undef j | 1537 %undef j |
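For orientation (my own example, not in the diff): the q#### names spell a shuffle immediate's four 2-bit selectors most-significant digit first, i.e. in the same high-to-low order the destination lanes come out.

    pshufd m0, m1, q1032   ; q1032 == 0x4E: picks dwords 2,3,0,1, swapping the two 64-bit halves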
| 1219 | 1538 |
| 1220 %macro FMA_INSTR 3 | 1539 %macro FMA_INSTR 3 |
| 1221 %macro %1 4-7 %1, %2, %3 | 1540 %macro %1 4-7 %1, %2, %3 |
| 1222 %if cpuflag(xop) | 1541 %if cpuflag(xop) |
| 1223 v%5 %1, %2, %3, %4 | 1542 v%5 %1, %2, %3, %4 |
| 1224 %else | 1543 %elifnidn %1, %4 |
| 1225 %6 %1, %2, %3 | 1544 %6 %1, %2, %3 |
| 1226 %7 %1, %4 | 1545 %7 %1, %4 |
| 1546 %else |
| 1547 %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported |
| 1227 %endif | 1548 %endif |
| 1228 %endmacro | 1549 %endmacro |
| 1229 %endmacro | 1550 %endmacro |
| 1230 | 1551 |
| 1231 FMA_INSTR pmacsdd, pmulld, paddd | |
| 1232 FMA_INSTR pmacsww, pmullw, paddw | 1552 FMA_INSTR pmacsww, pmullw, paddw |
| 1553 FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation |
| 1554 FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation |
| 1233 FMA_INSTR pmadcswd, pmaddwd, paddd | 1555 FMA_INSTR pmadcswd, pmaddwd, paddd |
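A hedged expansion sketch for the wrappers above: XOP hardware gets the fused instruction, everything else gets the named multiply/add pair, and aliasing the destination with the accumulator now triggers %error instead of a silent miscompute.

    pmacsww m0, m1, m2, m3   ; xop:       vpmacsww m0, m1, m2, m3
                             ; otherwise: pmullw m0, m1, m2 (emulated) / paddw m0, m3
    pmacsww m0, m1, m2, m0   ; otherwise: %error (m0 would be clobbered before the add)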
| 1556 |
| 1557 ; convert FMA4 to FMA3 if possible |
| 1558 %macro FMA4_INSTR 4 |
| 1559 %macro %1 4-8 %1, %2, %3, %4 |
| 1560 %if cpuflag(fma4) |
| 1561 v%5 %1, %2, %3, %4 |
| 1562 %elifidn %1, %2 |
| 1563 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 |
| 1564 %elifidn %1, %3 |
| 1565 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 |
| 1566 %elifidn %1, %4 |
| 1567 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 |
| 1568 %else |
| 1569 %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported |
| 1570 %endif |
| 1571 %endmacro |
| 1572 %endmacro |
| 1573 |
| 1574 FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd |
| 1575 FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps |
| 1576 FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd |
| 1577 FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss |
| 1578 |
| 1579 FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd |
| 1580 FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps |
| 1581 FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd |
| 1582 FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps |
| 1583 |
| 1584 FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd |
| 1585 FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps |
| 1586 FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd |
| 1587 FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss |
| 1588 |
| 1589 FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd |
| 1590 FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps |
| 1591 FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd |
| 1592 FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss |
| 1593 |
| 1594 FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd |
| 1595 FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps |
| 1596 FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd |
| 1597 FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss |
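A rough sketch of the conversion above (illustrative operands only): FMA3 is destructive, so the 132/213/231 form is chosen by which source operand the destination already aliases.

    fmaddps m0, m0, m1, m2   ; fma4: vfmaddps    m0, m0, m1, m2
                             ; fma3: vfmadd132ps m0, m2, m1    ; m0 = m0*m1 + m2
    fmaddps m0, m1, m2, m0   ; fma3: vfmadd231ps m0, m1, m2    ; m0 = m1*m2 + m0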
| 1598 |
| 1599 ; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug |
| 1600 %if ARCH_X86_64 == 0 |
| 1601 %macro vpbroadcastq 2 |
| 1602 %if sizeof%1 == 16 |
| 1603 movddup %1, %2 |
| 1604 %else |
| 1605 vbroadcastsd %1, %2 |
| 1606 %endif |
| 1607 %endmacro |
| 1608 %endif |
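In short (my summary of the workaround above, register usage hypothetical): on x86_32 the broken name is rebuilt from instructions yasm handles correctly, chosen by destination width.

    INIT_XMM avx2
    vpbroadcastq m0, [r0]    ; x86_32: becomes movddup xmm0, [r0]
    INIT_YMM avx2
    vpbroadcastq m0, [r0]    ; x86_32: becomes vbroadcastsd ymm0, [r0]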