OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; jsimdext.inc - common declarations |
| 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2010 D. R. Commander |
| 6 ; |
| 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library - version 1.02 |
| 9 ; |
| 10 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 11 ; |
| 12 ; This software is provided 'as-is', without any express or implied |
| 13 ; warranty. In no event will the authors be held liable for any damages |
| 14 ; arising from the use of this software. |
| 15 ; |
| 16 ; Permission is granted to anyone to use this software for any purpose, |
| 17 ; including commercial applications, and to alter it and redistribute it |
| 18 ; freely, subject to the following restrictions: |
| 19 ; |
| 20 ; 1. The origin of this software must not be misrepresented; you must not |
| 21 ; claim that you wrote the original software. If you use this software |
| 22 ; in a product, an acknowledgment in the product documentation would be |
| 23 ; appreciated but is not required. |
| 24 ; 2. Altered source versions must be plainly marked as such, and must not be |
| 25 ; misrepresented as being the original software. |
| 26 ; 3. This notice may not be removed or altered from any source distribution. |
| 27 ; |
| 28 ; [TAB8] |
| 29 |
| 30 ; ========================================================================== |
| 31 ; System-dependent configurations |
| 32 |
; Platform selection. One branch of the chain below is taken, keyed on the
; object-format macro given on the NASM command line (-DWIN32, -DWIN64,
; -DOBJ32, -DELF, -DAOUT or -DMACHO); anything else falls through to the
; final %else branch.
; Every branch defines SEG_TEXT and SEG_CONST (section name plus attributes).
; Some branches also define GOT_SYMBOL, PIC or EXTN, which are tested by the
; "External Symbol Name" and PIC sections further down in this file.
| 33 %ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- |
| 34 ; * Microsoft Visual C++ |
| 35 ; * MinGW (Minimalist GNU for Windows) |
| 36 ; * CygWin |
| 37 ; * LCC-Win32 |
| 38 |
| 39 ; -- segment definition -- |
| 40 ; |
| 41 %define SEG_TEXT .text align=16 public use32 class=CODE |
| 42 %define SEG_CONST .rdata align=16 public use32 class=CONST |
| 43 |
| 44 %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- |
| 45 ; * Microsoft Visual C++ |
| 46 |
| 47 ; -- segment definition -- |
| 48 ; |
| 49 %define SEG_TEXT .text align=16 public use64 class=CODE |
| 50 %define SEG_CONST .rdata align=16 public use64 class=CONST |
; EXTN is overridden here only when MSVC is also defined; without it the
; default from the "External Symbol Name" section (foo -> _foo) applies.
| 51 %ifdef MSVC |
| 52 %define EXTN(name) name ; foo() -> foo |
| 53 %endif |
| 54 |
| 55 %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- |
| 56 ; * Borland C++ (Win32) |
| 57 |
| 58 ; -- segment definition -- |
| 59 ; |
| 60 %define SEG_TEXT .text align=16 public use32 class=CODE |
| 61 %define SEG_CONST .data align=16 public use32 class=DATA |
| 62 |
| 63 %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ |
| 64 ; * Linux |
| 65 ; * *BSD family Unix using elf format |
| 66 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix |
| 67 |
; The section directive below is emitted wherever this header is included,
; so every including file carries the non-executable-stack note.
| 68 ; mark stack as non-executable |
| 69 section .note.GNU-stack noalloc noexec nowrite progbits |
| 70 |
| 71 ; -- segment definition -- |
| 72 ; |
| 73 %ifdef __x86_64__ |
| 74 %define SEG_TEXT .text progbits align=16 |
| 75 %define SEG_CONST .rodata progbits align=16 |
| 76 %else |
| 77 %define SEG_TEXT .text progbits alloc exec nowrite align=16 |
| 78 %define SEG_CONST .rodata progbits alloc noexec nowrite align=16 |
| 79 %endif |
| 80 |
| 81 ; To make the code position-independent, append -DPIC to the commandline |
| 82 ; |
| 83 %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC |
| 84 %define EXTN(name) name ; foo() -> foo |
| 85 |
| 86 %elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- |
| 87 ; * Older Linux using a.out format (nasm -f aout -DAOUT ...) |
| 88 ; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) |
| 89 |
| 90 ; -- segment definition -- |
| 91 ; |
| 92 %define SEG_TEXT .text |
| 93 %define SEG_CONST .data |
| 94 |
| 95 ; To make the code position-independent, append -DPIC to the commandline |
| 96 ; |
| 97 %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC |
| 98 |
| 99 %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- |
| 100 ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) |
| 101 |
| 102 ; -- segment definition -- |
| 103 ; |
| 104 %define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? |
| 105 %define SEG_CONST .rodata align=16 |
| 106 |
| 107 ; The generation of position-independent code (PIC) is the default on Darwin. |
| 108 ; |
; PIC is forced on for this branch. _MACHO_PIC_ is only a marker value: the
; PIC section compares GOT_SYMBOL against it with %ifidn to pick the
; Mach-O specific get_GOT implementation.
| 109 %define PIC |
| 110 %define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing |
| 111 |
| 112 %else ; ----(Other case)---------------------- |
| 113 |
; Fallback: plain section names, no GOT_SYMBOL (so PIC is undefined by the
; PIC section below) and the default EXTN decoration.
| 114 ; -- segment definition -- |
| 115 ; |
| 116 %define SEG_TEXT .text |
| 117 %define SEG_CONST .data |
| 118 |
| 119 %endif ; ---------------------------------------------- |
| 120 |
| 121 ; ========================================================================== |
| 122 |
| 123 ; -------------------------------------------------------------------------- |
| 124 ; Common types |
| 125 ; |
; Operand-size keywords and size constants shared by the SIMD sources.
; For each type T there are three macros: T (the NASM size keyword),
; SIZEOF_T (size in bytes) and T_BIT (size in bits).
; %define bodies are expanded when the macro is used, not where it is
; defined, so SIZEOF_POINTER, POINTER_BIT etc. may refer to SIZEOF_QWORD,
; QWORD_BIT, ... even though those are defined at the end of this section.
| 126 %ifdef __x86_64__ |
| 127 %define POINTER qword ; general pointer type |
| 128 %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER) |
| 129 %define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT |
| 130 %else |
| 131 %define POINTER dword ; general pointer type |
| 132 %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) |
| 133 %define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT |
| 134 %endif |
| 135 |
| 136 %define INT dword ; signed integer type |
| 137 %define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) |
| 138 %define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT |
| 139 |
| 140 %define FP32 dword ; IEEE754 single |
| 141 %define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) |
| 142 %define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT |
| 143 |
| 144 %define MMWORD qword ; int64 (MMX register) |
| 145 %define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) |
| 146 %define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT |
| 147 |
; XMMWORD, XMM_DWORD and XMM_MMWORD expand to nothing: a memory operand
; written as "XMMWORD [addr]" therefore carries no explicit size keyword and
; its size is taken from the instruction itself.
| 148 ; NASM is buggy and doesn't properly handle operand sizes for SSE |
| 149 ; instructions, so for now we have to define XMMWORD as blank. |
| 150 %define XMMWORD ; int128 (SSE register) |
| 151 %define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) |
| 152 %define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT |
| 153 |
| 154 ; Similar hacks for when we load a dword or MMWORD into an xmm# register |
| 155 %define XMM_DWORD |
| 156 %define XMM_MMWORD |
| 157 |
; Base byte sizes; every SIZEOF_* macro above resolves to one of these.
| 158 %define SIZEOF_BYTE 1 ; sizeof(BYTE) |
| 159 %define SIZEOF_WORD 2 ; sizeof(WORD) |
| 160 %define SIZEOF_DWORD 4 ; sizeof(DWORD) |
| 161 %define SIZEOF_QWORD 8 ; sizeof(QWORD) |
| 162 %define SIZEOF_OWORD 16 ; sizeof(OWORD) |
| 163 |
; Base bit widths; every *_BIT macro above resolves to one of these.
| 164 %define BYTE_BIT 8 ; CHAR_BIT in C |
| 165 %define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT |
| 166 %define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT |
| 167 %define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT |
| 168 %define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT |
| 169 |
| 170 ; -------------------------------------------------------------------------- |
| 171 ; External Symbol Name |
| 172 ; |
; Default symbol decoration, used only when no platform branch above has
; already defined EXTN: the %+ operator pastes a leading underscore onto
; the given name.
| 173 %ifndef EXTN |
| 174 %define EXTN(name) _ %+ name ; foo() -> _foo |
| 175 %endif |
| 176 |
| 177 ; -------------------------------------------------------------------------- |
| 178 ; Macros for position-independent code (PIC) support |
| 179 ; |
; PIC helpers. This section provides GOTOFF(got,sym) plus the macros
; get_GOT, pushpic, poppic and movpic in three flavours:
;   * PIC with GOT_SYMBOL == _MACHO_PIC_ : hand-built code-relative base
;   * PIC with any other GOT_SYMBOL      : GOT-relative (..gotpc / ..gotoff)
;   * no PIC                             : macros expand to nothing and
;                                          GOTOFF(got,sym) is simply (sym)
; PIC is silently dropped when the selected platform defined no GOT_SYMBOL.
| 180 %ifndef GOT_SYMBOL |
| 181 %undef PIC |
| 182 %endif |
| 183 |
| 184 %ifdef PIC ; ------------------------------------------- |
| 185 |
| 186 %ifidn GOT_SYMBOL,_MACHO_PIC_ ; -------------------- |
| 187 |
| 188 ; At present, nasm doesn't seem to support PIC generation for Mach-O. |
| 189 ; The PIC support code below is a little tricky. |
| 190 |
; const_base is a label placed in SEG_CONST at the point where this header is
; included; GOTOFF expresses every symbol as an offset from it. Note that the
; SECTION directive leaves SEG_CONST as the current section.
| 191 SECTION SEG_CONST |
| 192 const_base: |
| 193 |
| 194 %define GOTOFF(got,sym) (got) + (sym) - const_base |
| 195 |
; get_GOT reg : load the run-time address of const_base into reg.
;   1. call/ret through %%geteip copies the return address (the address of
;      the following add instruction, i.e. $) into ecx.
;   2. The add turns ecx into the address of %%ref.
;   3. ebp is saved and zeroed, then the bytes "8D,9C" (or "8D,8C") followed
;      by an assembled "jmp near const_base" are executed as a single
;      LEA: the jmp's E9 opcode byte and its 32-bit displacement serve as the
;      LEA's SIB byte and disp32, so the jmp itself never executes as a jump.
;      With ebp = 0 the LEA yields ecx + (const_base - %%ref) = const_base.
;   4. For a destination other than ebx the result is copied from ecx.
| 196 %imacro get_GOT 1 |
| 197 ; NOTE: this macro destroys ecx register. |
| 198 call %%geteip |
| 199 add ecx, byte (%%ref - $) |
| 200 jmp short %%adjust |
| 201 %%geteip: |
| 202 mov ecx, POINTER [esp] |
| 203 ret |
| 204 %%adjust: |
| 205 push ebp |
| 206 xor ebp,ebp ; ebp = 0 |
| 207 %ifidni %1,ebx ; (%1 == ebx) |
| 208 ; db 0x8D,0x9C + jmp near const_base = |
| 209 ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) |
| 210 db 0x8D,0x9C ; 8D,9C |
| 211 jmp near const_base ; E9,(const_base-%%ref) |
| 212 %%ref: |
| 213 %else ; (%1 != ebx) |
| 214 ; db 0x8D,0x8C + jmp near const_base = |
| 215 ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) |
| 216 db 0x8D,0x8C ; 8D,8C |
| 217 jmp near const_base ; E9,(const_base-%%ref) |
| 218 %%ref: mov %1, ecx |
| 219 %endif ; (%1 == ebx) |
| 220 pop ebp |
| 221 %endmacro |
| 222 |
| 223 %else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- |
| 224 |
| 225 %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff |
| 226 |
; get_GOT reg : load the address of the global offset table into reg.
; %%geteip copies the return address (the address of the add instruction)
; into reg; the add then applies the ..gotpc-relative distance from that
; instruction to GOT_SYMBOL. Only reg is modified.
| 227 %imacro get_GOT 1 |
| 228 extern GOT_SYMBOL |
| 229 call %%geteip |
| 230 add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc |
| 231 jmp short %%done |
| 232 %%geteip: |
| 233 mov %1, POINTER [esp] |
| 234 ret |
| 235 %%done: |
| 236 %endmacro |
| 237 |
| 238 %endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- |
| 239 |
; pushpic / poppic / movpic are push / pop / mov that are only emitted in PIC
; builds (the non-PIC versions below are empty), for use around the register
; that holds the PIC base.
| 240 %imacro pushpic 1.nolist |
| 241 push %1 |
| 242 %endmacro |
| 243 %imacro poppic 1.nolist |
| 244 pop %1 |
| 245 %endmacro |
| 246 %imacro movpic 2.nolist |
| 247 mov %1,%2 |
| 248 %endmacro |
| 249 |
| 250 %else ; !PIC ----------------------------------------- |
| 251 |
; Non-PIC build: GOTOFF ignores its first argument and the four macros
; generate no code, so the same source assembles with absolute addresses.
| 252 %define GOTOFF(got,sym) (sym) |
| 253 |
| 254 %imacro get_GOT 1.nolist |
| 255 %endmacro |
| 256 %imacro pushpic 1.nolist |
| 257 %endmacro |
| 258 %imacro poppic 1.nolist |
| 259 %endmacro |
| 260 %imacro movpic 2.nolist |
| 261 %endmacro |
| 262 |
| 263 %endif ; PIC ----------------------------------------- |
| 264 |
| 265 ; -------------------------------------------------------------------------- |
| 266 ; Align the next instruction on {2,4,8,16,..}-byte boundary. |
| 267 ; ".balign n,,m" in GNU as |
| 268 ; |
; Helper expressions for alignx (x and y are treated as 16-bit values):
;   MSKLE(x,y)  evaluates to a mask with its low bits all set when x <= y and
;               to 0 when x > y; it is ANDed with a repeat count to enable or
;               suppress a "times" line.
;   FILLB(b,n)  is the number of bytes from position b to the next n-byte
;               boundary of the current section ($$ is the section start);
;               n must be a power of two.
| 269 %define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16) |
| 270 %define FILLB(b,n) (($$-(b)) & ((n)-1)) |
| 271 |
; alignx n [, m] : pad code up to an n-byte boundary, but only when the gap
; measured at the start of the macro (%%bs) is at most m bytes (default
; 0xFFFF, i.e. effectively always).
; A gap of 16 bytes or more is filled entirely with one-byte NOPs by the
; first line. Smaller gaps are covered by the following lines, each of which
; re-evaluates the remaining gap at $ and emits progressively shorter
; no-op encodings (7, 7, 6, 4, 3, 2, 1 bytes).
; NOTE(review): the multi-byte fillers are 32-bit register forms
; (lea/mov on ebx and ebp). If this padding is ever executed from 64-bit
; code, a write to a 32-bit register zero-extends into the full 64-bit
; register - confirm how the x86-64 sources use alignx.
| 272 %imacro alignx 1-2.nolist 0xFFFF |
| 273 %%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ |
| 274 db 0x90 ; nop |
| 275 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ |
| 276 db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] |
| 277 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ |
| 278 db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] |
| 279 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ |
| 280 db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] |
| 281 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ |
| 282 db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] |
| 283 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ |
| 284 db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] |
| 285 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ |
| 286 db 0x8B,0xED ; mov ebp,ebp |
| 287 times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ |
| 288 db 0x90 ; nop |
| 289 %endmacro |
| 290 |
| 291 ; Align the next data on {2,4,8,16,..}-byte boundary. |
| 292 ; |
; alignz n : data counterpart of alignx; pads with zero bytes instead of
; no-op instructions.
| 293 %imacro alignz 1.nolist |
| 294 align %1, db 0 ; filling zeros |
| 295 %endmacro |
| 296 |
| 297 %ifdef __x86_64__ |
| 298 |
| 299 %ifdef WIN64 |
| 300 |
; collect_args / uncollect_args - Win64 (Microsoft x64 calling convention).
;
; collect_args copies the first six integer/pointer arguments into r10-r15:
;   r10 <- rcx, r11 <- rdx, r12 <- r8, r13 <- r9   (register arguments)
;   r14 <- [rax+48], r15 <- [rax+56]               (stack arguments 5 and 6)
; The previous contents of r10-r15 are pushed first. rsi, rdi, xmm6 and
; xmm7, which are callee-saved in this ABI, are saved afterwards.
; NOTE(review): arguments 5 and 6 are read through rax, so the caller must
; have loaded rax with a frame base before invoking the macro. Offsets 48/56
; presumably skip a saved rbp, the return address and the 32-byte shadow
; space - confirm against the call sites.
;
; xmm6/xmm7 are stored with movups so that all 128 bits are preserved, as
; the ABI requires. (movlpd would store and reload only the low 64 bits,
; losing the upper halves.) movups has no alignment requirement, so it is
; safe whatever rsp alignment the caller has at this point.
;
; uncollect_args undoes collect_args in exact reverse order; the two macros
; must be kept in step.
| 301 %imacro collect_args 0 |
| 302 push r10 |
| 303 push r11 |
| 304 push r12 |
| 305 push r13 |
| 306 push r14 |
| 307 push r15 |
| 308 mov r10, rcx |
| 309 mov r11, rdx |
| 310 mov r12, r8 |
| 311 mov r13, r9 |
| 312 mov r14, [rax+48] |
| 313 mov r15, [rax+56] |
| 314 push rsi |
| 315 push rdi |
| 316 sub rsp, SIZEOF_XMMWORD |
| 317 movups XMMWORD [rsp], xmm6 ; save all 128 bits of xmm6 |
| 318 sub rsp, SIZEOF_XMMWORD |
| 319 movups XMMWORD [rsp], xmm7 ; save all 128 bits of xmm7 |
| 320 %endmacro |
| 321 |
| 322 %imacro uncollect_args 0 |
| 323 movups xmm7, XMMWORD [rsp] ; restore all 128 bits of xmm7 |
| 324 add rsp, SIZEOF_XMMWORD |
| 325 movups xmm6, XMMWORD [rsp] ; restore all 128 bits of xmm6 |
| 326 add rsp, SIZEOF_XMMWORD |
| 327 pop rdi |
| 328 pop rsi |
| 329 pop r15 |
| 330 pop r14 |
| 331 pop r13 |
| 332 pop r12 |
| 333 pop r11 |
| 334 pop r10 |
| 335 %endmacro |
| 336 |
| 337 %else |
| 338 |
; collect_args / uncollect_args - non-Windows x86-64 variant.
;
; collect_args pushes the previous contents of r10-r15 and then copies the
; six integer/pointer argument registers into them:
;   r10 <- rdi, r11 <- rsi, r12 <- rdx, r13 <- rcx, r14 <- r8, r15 <- r9
; so that function bodies can use r10-r15 for arguments 1-6 on both ABIs.
; uncollect_args pops r15-r10 in reverse order; nothing else is saved here.
| 339 %imacro collect_args 0 |
| 340 push r10 |
| 341 push r11 |
| 342 push r12 |
| 343 push r13 |
| 344 push r14 |
| 345 push r15 |
| 346 mov r10, rdi |
| 347 mov r11, rsi |
| 348 mov r12, rdx |
| 349 mov r13, rcx |
| 350 mov r14, r8 |
| 351 mov r15, r9 |
| 352 %endmacro |
| 353 |
| 354 %imacro uncollect_args 0 |
| 355 pop r15 |
| 356 pop r14 |
| 357 pop r13 |
| 358 pop r12 |
| 359 pop r11 |
| 360 pop r10 |
| 361 %endmacro |
| 362 |
| 363 %endif |
| 364 |
| 365 %endif |
| 366 |
| 367 ; -------------------------------------------------------------------------- |
| 368 ; Defines picked up from the C headers |
| 369 ; |
| 370 %include "jsimdcfg.inc" |
| 371 |
| 372 ; -------------------------------------------------------------------------- |
OLD | NEW |