Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/x86inc/x86inc.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
1 ;*****************************************************************************
2 ;* x86inc.asm: x264asm abstraction layer
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2012 x264 project
5 ;*
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Anton Mitrofanov <BugMaster@narod.ru>
8 ;* Jason Garrett-Glaser <darkshikari@gmail.com>
9 ;* Henrik Gramner <hengar-6@student.ltu.se>
10 ;*
11 ;* Permission to use, copy, modify, and/or distribute this software for any
12 ;* purpose with or without fee is hereby granted, provided that the above
13 ;* copyright notice and this permission notice appear in all copies.
14 ;*
15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 ;*****************************************************************************
23
24 ; This is a header file for the x264ASM assembly language, which uses
25 ; NASM/YASM syntax combined with a large number of macros to provide easy
26 ; abstraction between different calling conventions (x86_32, win64, linux64).
27 ; It also has various other useful features to simplify writing the kind of
28 ; DSP functions that are most often used in x264.
29
30 ; Unlike the rest of x264, this file is available under an ISC license, as it
31 ; has significant usefulness outside of x264 and we want it to be available
32 ; to the largest audience possible. Of course, if you modify it for your own
33 ; purposes to add a new feature, we strongly encourage contributing a patch
34 ; as this feature might be useful for others as well. Send patches or ideas
35 ; to x264-devel@videolan.org .
36
37 %include "vpx_config.asm"
38
39 %define program_name vp9
40
41
42 %define UNIX64 0
43 %define WIN64 0
44 %if ARCH_X86_64
45 %ifidn __OUTPUT_FORMAT__,win32
46 %define WIN64 1
47 %elifidn __OUTPUT_FORMAT__,win64
48 %define WIN64 1
49 %elifidn __OUTPUT_FORMAT__,x64
50 %define WIN64 1
51 %else
52 %define UNIX64 1
53 %endif
54 %endif
55
56 %ifidn __OUTPUT_FORMAT__,elf32
57 %define mangle(x) x
58 %elifidn __OUTPUT_FORMAT__,elf64
59 %define mangle(x) x
60 %elifidn __OUTPUT_FORMAT__,elf
61 %define mangle(x) x
62 %elifidn __OUTPUT_FORMAT__,x64
63 %define mangle(x) x
64 %else
65 %define mangle(x) _ %+ x
66 %endif
67
68 ; FIXME: All of the 64bit asm functions that take a stride as an argument
69 ; via register, assume that the high dword of that register is filled with 0.
70 ; This is true in practice (since we never do any 64bit arithmetic on strides,
71 ; and x264's strides are all positive), but is not guaranteed by the ABI.
72
73 ; Name of the .rodata section.
74 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
75 ; so use a different read-only section.
76 %macro SECTION_RODATA 0-1 16
77 %ifidn __OUTPUT_FORMAT__,macho64
78 SECTION .text align=%1
79 %elifidn __OUTPUT_FORMAT__,macho
80 SECTION .text align=%1
81 fakegot:
82 %elifidn __OUTPUT_FORMAT__,aout
83 section .text
84 %else
85 SECTION .rodata align=%1
86 %endif
87 %endmacro
88
89 ; aout does not support align=
90 %macro SECTION_TEXT 0-1 16
91 %ifidn __OUTPUT_FORMAT__,aout
92 SECTION .text
93 %else
94 SECTION .text align=%1
95 %endif
96 %endmacro
97
98 %if WIN64
99 %define PIC
100 %elifidn __OUTPUT_FORMAT__,macho64
101 %define PIC
102 %elif ARCH_X86_64 == 0
103 ; x86_32 doesn't require PIC.
104 ; Some distros prefer shared objects to be PIC, but nothing breaks if
105 ; the code contains a few textrels, so we'll skip that complexity.
106 %undef PIC
107 %elif CONFIG_PIC
108 %define PIC
109 %endif
110 %ifdef PIC
111 default rel
112 %endif
113
114 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
115 CPU amdnop
116
117 ; Macros to eliminate most code duplication between x86_32 and x86_64:
118 ; Currently this works only for leaf functions which load all their arguments
119 ; into registers at the start, and make no other use of the stack. Luckily that
120 ; covers most of x264's asm.
121
122 ; PROLOGUE:
123 ; %1 = number of arguments. loads them from stack if needed.
124 ; %2 = number of registers used. pushes callee-saved regs if needed.
125 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
126 ; %4 = list of names to define to registers
127 ; PROLOGUE can also be invoked by adding the same options to cglobal
128
129 ; e.g.
130 ; cglobal foo, 2,3,0, dst, src, tmp
131 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
132
133 ; TODO Some functions can use some args directly from the stack. If they're the
134 ; last args then you can just not declare them, but if they're in the middle
135 ; we need a more flexible macro.
136
137 ; RET:
138 ; Pops anything that was pushed by PROLOGUE, and returns.
139
140 ; REP_RET:
141 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
142 ; which are slow when a normal ret follows a branch.
143
144 ; registers:
145 ; rN and rNq are the native-size register holding function argument N
146 ; rNd, rNw, rNb are dword, word, and byte size
147 ; rNm is the original location of arg N (a register or on the stack), dword
148 ; rNmp is native size
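; Usage sketch (illustrative only; the function and argument names below are
; made up, not part of this patch): a minimal leaf function written against
; these conventions might look like
;   cglobal copy_row, 3,3,1, dst, src, stride
;       movsxdifnidn strideq, strided ; no-op on x86_32, sign-extend on x86_64
;       mova    m0, [srcq]
;       mova    [dstq+strideq], m0
;       RET
; where dstq/srcq/strideq are the native-size views of args 0-2 and RET undoes
; whatever PROLOGUE pushed.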
149
150 %macro DECLARE_REG 5-6
151 %define r%1q %2
152 %define r%1d %3
153 %define r%1w %4
154 %define r%1b %5
155 %if %0 == 5
156 %define r%1m %3
157 %define r%1mp %2
158 %elif ARCH_X86_64 ; memory
159 %define r%1m [rsp + stack_offset + %6]
160 %define r%1mp qword r %+ %1m
161 %else
162 %define r%1m [esp + stack_offset + %6]
163 %define r%1mp dword r %+ %1m
164 %endif
165 %define r%1 %2
166 %endmacro
167
168 %macro DECLARE_REG_SIZE 2
169 %define r%1q r%1
170 %define e%1q r%1
171 %define r%1d e%1
172 %define e%1d e%1
173 %define r%1w %1
174 %define e%1w %1
175 %define r%1b %2
176 %define e%1b %2
177 %if ARCH_X86_64 == 0
178 %define r%1 e%1
179 %endif
180 %endmacro
181
182 DECLARE_REG_SIZE ax, al
183 DECLARE_REG_SIZE bx, bl
184 DECLARE_REG_SIZE cx, cl
185 DECLARE_REG_SIZE dx, dl
186 DECLARE_REG_SIZE si, sil
187 DECLARE_REG_SIZE di, dil
188 DECLARE_REG_SIZE bp, bpl
189
190 ; t# defines for when per-arch register allocation is more complex than just function arguments
191
192 %macro DECLARE_REG_TMP 1-*
193 %assign %%i 0
194 %rep %0
195 CAT_XDEFINE t, %%i, r%1
196 %assign %%i %%i+1
197 %rotate 1
198 %endrep
199 %endmacro
200
201 %macro DECLARE_REG_TMP_SIZE 0-*
202 %rep %0
203 %define t%1q t%1 %+ q
204 %define t%1d t%1 %+ d
205 %define t%1w t%1 %+ w
206 %define t%1b t%1 %+ b
207 %rotate 1
208 %endrep
209 %endmacro
210
211 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
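; Usage sketch (illustrative only): DECLARE_REG_TMP simply aliases t# names
; onto argument registers, so a hypothetical
;   DECLARE_REG_TMP 2, 0, 1
; makes t0/t0q/t0d resolve to r2, t1 to r0 and t2 to r1, letting per-arch
; register choices be written once and reused by the code that follows.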
212
213 %if ARCH_X86_64
214 %define gprsize 8
215 %else
216 %define gprsize 4
217 %endif
218
219 %macro PUSH 1
220 push %1
221 %assign stack_offset stack_offset+gprsize
222 %endmacro
223
224 %macro POP 1
225 pop %1
226 %assign stack_offset stack_offset-gprsize
227 %endmacro
228
229 %macro PUSH_IF_USED 1-*
230 %rep %0
231 %if %1 < regs_used
232 PUSH r%1
233 %endif
234 %rotate 1
235 %endrep
236 %endmacro
237
238 %macro POP_IF_USED 1-*
239 %rep %0
240 %if %1 < regs_used
241 pop r%1
242 %endif
243 %rotate 1
244 %endrep
245 %endmacro
246
247 %macro LOAD_IF_USED 1-*
248 %rep %0
249 %if %1 < num_args
250 mov r%1, r %+ %1 %+ mp
251 %endif
252 %rotate 1
253 %endrep
254 %endmacro
255
256 %macro SUB 2
257 sub %1, %2
258 %ifidn %1, rsp
259 %assign stack_offset stack_offset+(%2)
260 %endif
261 %endmacro
262
263 %macro ADD 2
264 add %1, %2
265 %ifidn %1, rsp
266 %assign stack_offset stack_offset-(%2)
267 %endif
268 %endmacro
269
270 %macro movifnidn 2
271 %ifnidn %1, %2
272 mov %1, %2
273 %endif
274 %endmacro
275
276 %macro movsxdifnidn 2
277 %ifnidn %1, %2
278 movsxd %1, %2
279 %endif
280 %endmacro
281
282 %macro ASSERT 1
283 %if (%1) == 0
284 %error assert failed
285 %endif
286 %endmacro
287
288 %macro DEFINE_ARGS 0-*
289 %ifdef n_arg_names
290 %assign %%i 0
291 %rep n_arg_names
292 CAT_UNDEF arg_name %+ %%i, q
293 CAT_UNDEF arg_name %+ %%i, d
294 CAT_UNDEF arg_name %+ %%i, w
295 CAT_UNDEF arg_name %+ %%i, b
296 CAT_UNDEF arg_name %+ %%i, m
297 CAT_UNDEF arg_name %+ %%i, mp
298 CAT_UNDEF arg_name, %%i
299 %assign %%i %%i+1
300 %endrep
301 %endif
302
303 %xdefine %%stack_offset stack_offset
304 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
305 %assign %%i 0
306 %rep %0
307 %xdefine %1q r %+ %%i %+ q
308 %xdefine %1d r %+ %%i %+ d
309 %xdefine %1w r %+ %%i %+ w
310 %xdefine %1b r %+ %%i %+ b
311 %xdefine %1m r %+ %%i %+ m
312 %xdefine %1mp r %+ %%i %+ mp
313 CAT_XDEFINE arg_name, %%i, %1
314 %assign %%i %%i+1
315 %rotate 1
316 %endrep
317 %xdefine stack_offset %%stack_offset
318 %assign n_arg_names %0
319 %endmacro
320
321 %if WIN64 ; Windows x64 ;=================================================
322
323 DECLARE_REG 0, rcx, ecx, cx, cl
324 DECLARE_REG 1, rdx, edx, dx, dl
325 DECLARE_REG 2, R8, R8D, R8W, R8B
326 DECLARE_REG 3, R9, R9D, R9W, R9B
327 DECLARE_REG 4, R10, R10D, R10W, R10B, 40
328 DECLARE_REG 5, R11, R11D, R11W, R11B, 48
329 DECLARE_REG 6, rax, eax, ax, al, 56
330 DECLARE_REG 7, rdi, edi, di, dil, 64
331 DECLARE_REG 8, rsi, esi, si, sil, 72
332 DECLARE_REG 9, rbx, ebx, bx, bl, 80
333 DECLARE_REG 10, rbp, ebp, bp, bpl, 88
334 DECLARE_REG 11, R12, R12D, R12W, R12B, 96
335 DECLARE_REG 12, R13, R13D, R13W, R13B, 104
336 DECLARE_REG 13, R14, R14D, R14W, R14B, 112
337 DECLARE_REG 14, R15, R15D, R15W, R15B, 120
338
339 %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
340 %assign num_args %1
341 %assign regs_used %2
342 ASSERT regs_used >= num_args
343 ASSERT regs_used <= 15
344 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
345 %if mmsize == 8
346 %assign xmm_regs_used 0
347 %else
348 WIN64_SPILL_XMM %3
349 %endif
350 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
351 DEFINE_ARGS %4
352 %endmacro
353
354 %macro WIN64_SPILL_XMM 1
355 %assign xmm_regs_used %1
356 ASSERT xmm_regs_used <= 16
357 %if xmm_regs_used > 6
358 SUB rsp, (xmm_regs_used-6)*16+16
359 %assign %%i xmm_regs_used
360 %rep (xmm_regs_used-6)
361 %assign %%i %%i-1
362 movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
363 %endrep
364 %endif
365 %endmacro
366
367 %macro WIN64_RESTORE_XMM_INTERNAL 1
368 %if xmm_regs_used > 6
369 %assign %%i xmm_regs_used
370 %rep (xmm_regs_used-6)
371 %assign %%i %%i-1
372 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
373 %endrep
374 add %1, (xmm_regs_used-6)*16+16
375 %endif
376 %endmacro
377
378 %macro WIN64_RESTORE_XMM 1
379 WIN64_RESTORE_XMM_INTERNAL %1
380 %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
381 %assign xmm_regs_used 0
382 %endmacro
383
384 %macro RET 0
385 WIN64_RESTORE_XMM_INTERNAL rsp
386 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
387 ret
388 %endmacro
389
390 %macro REP_RET 0
391 %if regs_used > 7 || xmm_regs_used > 6
392 RET
393 %else
394 rep ret
395 %endif
396 %endmacro
397
398 %elif ARCH_X86_64 ; *nix x64 ;=============================================
399
400 DECLARE_REG 0, rdi, edi, di, dil
401 DECLARE_REG 1, rsi, esi, si, sil
402 DECLARE_REG 2, rdx, edx, dx, dl
403 DECLARE_REG 3, rcx, ecx, cx, cl
404 DECLARE_REG 4, R8, R8D, R8W, R8B
405 DECLARE_REG 5, R9, R9D, R9W, R9B
406 DECLARE_REG 6, rax, eax, ax, al, 8
407 DECLARE_REG 7, R10, R10D, R10W, R10B, 16
408 DECLARE_REG 8, R11, R11D, R11W, R11B, 24
409 DECLARE_REG 9, rbx, ebx, bx, bl, 32
410 DECLARE_REG 10, rbp, ebp, bp, bpl, 40
411 DECLARE_REG 11, R12, R12D, R12W, R12B, 48
412 DECLARE_REG 12, R13, R13D, R13W, R13B, 56
413 DECLARE_REG 13, R14, R14D, R14W, R14B, 64
414 DECLARE_REG 14, R15, R15D, R15W, R15B, 72
415
416 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
417 %assign num_args %1
418 %assign regs_used %2
419 ASSERT regs_used >= num_args
420 ASSERT regs_used <= 15
421 PUSH_IF_USED 9, 10, 11, 12, 13, 14
422 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
423 DEFINE_ARGS %4
424 %endmacro
425
426 %macro RET 0
427 POP_IF_USED 14, 13, 12, 11, 10, 9
428 ret
429 %endmacro
430
431 %macro REP_RET 0
432 %if regs_used > 9
433 RET
434 %else
435 rep ret
436 %endif
437 %endmacro
438
439 %else ; X86_32 ;==============================================================
440
441 DECLARE_REG 0, eax, eax, ax, al, 4
442 DECLARE_REG 1, ecx, ecx, cx, cl, 8
443 DECLARE_REG 2, edx, edx, dx, dl, 12
444 DECLARE_REG 3, ebx, ebx, bx, bl, 16
445 DECLARE_REG 4, esi, esi, si, null, 20
446 DECLARE_REG 5, edi, edi, di, null, 24
447 DECLARE_REG 6, ebp, ebp, bp, null, 28
448 %define rsp esp
449
450 %macro DECLARE_ARG 1-*
451 %rep %0
452 %define r%1m [esp + stack_offset + 4*%1 + 4]
453 %define r%1mp dword r%1m
454 %rotate 1
455 %endrep
456 %endmacro
457
458 DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
459
460 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
461 %assign num_args %1
462 %assign regs_used %2
463 %if regs_used > 7
464 %assign regs_used 7
465 %endif
466 ASSERT regs_used >= num_args
467 PUSH_IF_USED 3, 4, 5, 6
468 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
469 DEFINE_ARGS %4
470 %endmacro
471
472 %macro RET 0
473 POP_IF_USED 6, 5, 4, 3
474 ret
475 %endmacro
476
477 %macro REP_RET 0
478 %if regs_used > 3
479 RET
480 %else
481 rep ret
482 %endif
483 %endmacro
484
485 %endif ;======================================================================
486
487 %if WIN64 == 0
488 %macro WIN64_SPILL_XMM 1
489 %endmacro
490 %macro WIN64_RESTORE_XMM 1
491 %endmacro
492 %endif
493
494 ;=============================================================================
495 ; arch-independent part
496 ;=============================================================================
497
498 %assign function_align 16
499
500 ; Begin a function.
501 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
502 ; subsequent uses of the function name automatically refer to the mangled version.
503 ; Appends cpuflags to the function name if cpuflags has been specified.
504 %macro cglobal 1-2+ ; name, [PROLOGUE args]
505 %if %0 == 1
506 cglobal_internal %1 %+ SUFFIX
507 %else
508 cglobal_internal %1 %+ SUFFIX, %2
509 %endif
510 %endmacro
511 %macro cglobal_internal 1-2+
512 %ifndef cglobaled_%1
513 %xdefine %1 mangle(program_name %+ _ %+ %1)
514 %xdefine %1.skip_prologue %1 %+ .skip_prologue
515 CAT_XDEFINE cglobaled_, %1, 1
516 %endif
517 %xdefine current_function %1
518 %ifidn __OUTPUT_FORMAT__,elf
519 global %1:function hidden
520 %elifidn __OUTPUT_FORMAT__,elf32
521 global %1:function hidden
522 %elifidn __OUTPUT_FORMAT__,elf64
523 global %1:function hidden
524 %else
525 global %1
526 %endif
527 align function_align
528 %1:
529 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
530 %assign stack_offset 0
531 %if %0 > 1
532 PROLOGUE %2
533 %endif
534 %endmacro
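; Usage sketch (illustrative only; "idct4x4" is a made-up name): with
; program_name set to vp9 above, a hypothetical
;   INIT_XMM sse2
;   cglobal idct4x4, 2,2,4, out, in
; declares a global symbol (hidden on ELF) named vp9_idct4x4_sse2, plus a
; leading underscore on targets whose mangle() adds one, and runs PROLOGUE.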
535
536 %macro cextern 1
537 %xdefine %1 mangle(program_name %+ _ %+ %1)
538 CAT_XDEFINE cglobaled_, %1, 1
539 extern %1
540 %endmacro
541
542 ; like cextern, but without the prefix
543 %macro cextern_naked 1
544 %xdefine %1 mangle(%1)
545 CAT_XDEFINE cglobaled_, %1, 1
546 extern %1
547 %endmacro
548
549 %macro const 2+
550 %xdefine %1 mangle(program_name %+ _ %+ %1)
551 global %1
552 %1: %2
553 %endmacro
554
555 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
556 ; executable by default.
557 %ifidn __OUTPUT_FORMAT__,elf
558 SECTION .note.GNU-stack noalloc noexec nowrite progbits
559 %elifidn __OUTPUT_FORMAT__,elf32
560 SECTION .note.GNU-stack noalloc noexec nowrite progbits
561 %elifidn __OUTPUT_FORMAT__,elf64
562 SECTION .note.GNU-stack noalloc noexec nowrite progbits
563 %endif
564
565 ; cpuflags
566
567 %assign cpuflags_mmx (1<<0)
568 %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
569 %assign cpuflags_3dnow (1<<2) | cpuflags_mmx
570 %assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
571 %assign cpuflags_sse (1<<4) | cpuflags_mmx2
572 %assign cpuflags_sse2 (1<<5) | cpuflags_sse
573 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
574 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2
575 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
576 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
577 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4
578 %assign cpuflags_avx (1<<11)| cpuflags_sse42
579 %assign cpuflags_xop (1<<12)| cpuflags_avx
580 %assign cpuflags_fma4 (1<<13)| cpuflags_avx
581
582 %assign cpuflags_cache32 (1<<16)
583 %assign cpuflags_cache64 (1<<17)
584 %assign cpuflags_slowctz (1<<18)
585 %assign cpuflags_lzcnt (1<<19)
586 %assign cpuflags_misalign (1<<20)
587 %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
588 %assign cpuflags_atom (1<<22)
589
590 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
591 %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
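; Usage sketch (illustrative only): these tests are evaluated at assembly time
; inside a function body, e.g.
;   %if cpuflag(ssse3)
;       pshufb  m0, m2      ; single-instruction path
;   %else
;       punpcklbw m0, m1    ; some longer pre-SSSE3 fallback
;   %endif
; cpuflags itself is set by INIT_MMX/INIT_XMM/INIT_YMM via INIT_CPUFLAGS below.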
592
593 ; Takes up to 2 cpuflags from the above list.
594 ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
595 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
596 %macro INIT_CPUFLAGS 0-2
597 %if %0 >= 1
598 %xdefine cpuname %1
599 %assign cpuflags cpuflags_%1
600 %if %0 >= 2
601 %xdefine cpuname %1_%2
602 %assign cpuflags cpuflags | cpuflags_%2
603 %endif
604 %xdefine SUFFIX _ %+ cpuname
605 %if cpuflag(avx)
606 %assign avx_enabled 1
607 %endif
608 %if mmsize == 16 && notcpuflag(sse2)
609 %define mova movaps
610 %define movu movups
611 %define movnta movntps
612 %endif
613 %if cpuflag(aligned)
614 %define movu mova
615 %elifidn %1, sse3
616 %define movu lddqu
617 %endif
618 %else
619 %xdefine SUFFIX
620 %undef cpuname
621 %undef cpuflags
622 %endif
623 %endmacro
624
625 ; merge mmx and sse*
626
627 %macro CAT_XDEFINE 3
628 %xdefine %1%2 %3
629 %endmacro
630
631 %macro CAT_UNDEF 2
632 %undef %1%2
633 %endmacro
634
635 %macro INIT_MMX 0-1+
636 %assign avx_enabled 0
637 %define RESET_MM_PERMUTATION INIT_MMX %1
638 %define mmsize 8
639 %define num_mmregs 8
640 %define mova movq
641 %define movu movq
642 %define movh movd
643 %define movnta movntq
644 %assign %%i 0
645 %rep 8
646 CAT_XDEFINE m, %%i, mm %+ %%i
647 CAT_XDEFINE nmm, %%i, %%i
648 %assign %%i %%i+1
649 %endrep
650 %rep 8
651 CAT_UNDEF m, %%i
652 CAT_UNDEF nmm, %%i
653 %assign %%i %%i+1
654 %endrep
655 INIT_CPUFLAGS %1
656 %endmacro
657
658 %macro INIT_XMM 0-1+
659 %assign avx_enabled 0
660 %define RESET_MM_PERMUTATION INIT_XMM %1
661 %define mmsize 16
662 %define num_mmregs 8
663 %if ARCH_X86_64
664 %define num_mmregs 16
665 %endif
666 %define mova movdqa
667 %define movu movdqu
668 %define movh movq
669 %define movnta movntdq
670 %assign %%i 0
671 %rep num_mmregs
672 CAT_XDEFINE m, %%i, xmm %+ %%i
673 CAT_XDEFINE nxmm, %%i, %%i
674 %assign %%i %%i+1
675 %endrep
676 INIT_CPUFLAGS %1
677 %endmacro
678
679 ; FIXME: INIT_AVX can be replaced by INIT_XMM avx
680 %macro INIT_AVX 0
681 INIT_XMM
682 %assign avx_enabled 1
683 %define PALIGNR PALIGNR_SSSE3
684 %define RESET_MM_PERMUTATION INIT_AVX
685 %endmacro
686
687 %macro INIT_YMM 0-1+
688 %assign avx_enabled 1
689 %define RESET_MM_PERMUTATION INIT_YMM %1
690 %define mmsize 32
691 %define num_mmregs 8
692 %if ARCH_X86_64
693 %define num_mmregs 16
694 %endif
695 %define mova vmovaps
696 %define movu vmovups
697 %undef movh
698 %define movnta vmovntps
699 %assign %%i 0
700 %rep num_mmregs
701 CAT_XDEFINE m, %%i, ymm %+ %%i
702 CAT_XDEFINE nymm, %%i, %%i
703 %assign %%i %%i+1
704 %endrep
705 INIT_CPUFLAGS %1
706 %endmacro
707
708 INIT_XMM
709
710 ; I often want to use macros that permute their arguments. e.g. there's no
711 ; efficient way to implement butterfly or transpose or dct without swapping some
712 ; arguments.
713 ;
714 ; I would like to not have to manually keep track of the permutations:
715 ; If I insert a permutation in the middle of a function, it should automatically
716 ; change everything that follows. For more complex macros I may also have multiple
717 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
718 ;
719 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
720 ; permutes its arguments. It's equivalent to exchanging the contents of the
721 ; registers, except that this way you exchange the register names instead, so it
722 ; doesn't cost any cycles.
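; Usage sketch (illustrative only): if a butterfly step leaves the "low" half
; of a result in m1 and the "high" half in m0, ending the macro with
;   SWAP 0, 1
; renames the registers so that subsequent code can keep assuming the low half
; lives in m0, without emitting any move instructions.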
723
724 %macro PERMUTE 2-* ; takes a list of pairs to swap
725 %rep %0/2
726 %xdefine tmp%2 m%2
727 %xdefine ntmp%2 nm%2
728 %rotate 2
729 %endrep
730 %rep %0/2
731 %xdefine m%1 tmp%2
732 %xdefine nm%1 ntmp%2
733 %undef tmp%2
734 %undef ntmp%2
735 %rotate 2
736 %endrep
737 %endmacro
738
739 %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
740 %rep %0-1
741 %ifdef m%1
742 %xdefine tmp m%1
743 %xdefine m%1 m%2
744 %xdefine m%2 tmp
745 CAT_XDEFINE n, m%1, %1
746 CAT_XDEFINE n, m%2, %2
747 %else
748 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
749 ; Be careful using this mode in nested macros though, as in some cases there may be
750 ; other copies of m# that have already been dereferenced and don't get updated correctly.
751 %xdefine %%n1 n %+ %1
752 %xdefine %%n2 n %+ %2
753 %xdefine tmp m %+ %%n1
754 CAT_XDEFINE m, %%n1, m %+ %%n2
755 CAT_XDEFINE m, %%n2, tmp
756 CAT_XDEFINE n, m %+ %%n1, %%n1
757 CAT_XDEFINE n, m %+ %%n2, %%n2
758 %endif
759 %undef tmp
760 %rotate 1
761 %endrep
762 %endmacro
763
764 ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
765 ; calls to that function will automatically load the permutation, so values can
766 ; be returned in mmregs.
767 %macro SAVE_MM_PERMUTATION 0-1
768 %if %0
769 %xdefine %%f %1_m
770 %else
771 %xdefine %%f current_function %+ _m
772 %endif
773 %assign %%i 0
774 %rep num_mmregs
775 CAT_XDEFINE %%f, %%i, m %+ %%i
776 %assign %%i %%i+1
777 %endrep
778 %endmacro
779
780 %macro LOAD_MM_PERMUTATION 1 ; name to load from
781 %ifdef %1_m0
782 %assign %%i 0
783 %rep num_mmregs
784 CAT_XDEFINE m, %%i, %1_m %+ %%i
785 CAT_XDEFINE n, m %+ %%i, %%i
786 %assign %%i %%i+1
787 %endrep
788 %endif
789 %endmacro
790
791 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
792 %macro call 1
793 call_internal %1, %1 %+ SUFFIX
794 %endmacro
795 %macro call_internal 2
796 %xdefine %%i %1
797 %ifndef cglobaled_%1
798 %ifdef cglobaled_%2
799 %xdefine %%i %2
800 %endif
801 %endif
802 call %%i
803 LOAD_MM_PERMUTATION %%i
804 %endmacro
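; Usage sketch (illustrative only; "helper" is a made-up name): a function that
; returns values in mmregs would end with
;   SAVE_MM_PERMUTATION
;   RET
; and a caller assembled for the same cpu can then write
;   call helper
; the call wrapper above resolves the suffixed name and reloads the saved
; permutation, so the caller's m# names match where helper left its results.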
805
806 ; Substitutions that reduce instruction size but are functionally equivalent (e.g. add x,128 needs an imm32 encoding, while the equivalent sub x,-128 fits in a sign-extended imm8)
807 %macro add 2
808 %ifnum %2
809 %if %2==128
810 sub %1, -128
811 %else
812 add %1, %2
813 %endif
814 %else
815 add %1, %2
816 %endif
817 %endmacro
818
819 %macro sub 2
820 %ifnum %2
821 %if %2==128
822 add %1, -128
823 %else
824 sub %1, %2
825 %endif
826 %else
827 sub %1, %2
828 %endif
829 %endmacro
830
831 ;=============================================================================
832 ; AVX abstraction layer
833 ;=============================================================================
834
835 %assign i 0
836 %rep 16
837 %if i < 8
838 CAT_XDEFINE sizeofmm, i, 8
839 %endif
840 CAT_XDEFINE sizeofxmm, i, 16
841 CAT_XDEFINE sizeofymm, i, 32
842 %assign i i+1
843 %endrep
844 %undef i
845
846 ;%1 == instruction
847 ;%2 == 1 if float, 0 if int
848 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
849 ;%4 == number of operands given
850 ;%5+: operands
851 %macro RUN_AVX_INSTR 6-7+
852 %ifid %5
853 %define %%size sizeof%5
854 %else
855 %define %%size mmsize
856 %endif
857 %if %%size==32
858 %if %0 >= 7
859 v%1 %5, %6, %7
860 %else
861 v%1 %5, %6
862 %endif
863 %else
864 %if %%size==8
865 %define %%regmov movq
866 %elif %2
867 %define %%regmov movaps
868 %else
869 %define %%regmov movdqa
870 %endif
871
872 %if %4>=3+%3
873 %ifnidn %5, %6
874 %if avx_enabled && sizeof%5==16
875 v%1 %5, %6, %7
876 %else
877 %%regmov %5, %6
878 %1 %5, %7
879 %endif
880 %else
881 %1 %5, %7
882 %endif
883 %elif %3
884 %1 %5, %6, %7
885 %else
886 %1 %5, %6
887 %endif
888 %endif
889 %endmacro
890
891 ; 3arg AVX ops with a memory arg can only have it in src2,
892 ; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
893 ; So, if the op is symmetric and the wrong one is memory, swap them.
894 %macro RUN_AVX_INSTR1 8
895 %assign %%swap 0
896 %if avx_enabled
897 %ifnid %6
898 %assign %%swap 1
899 %endif
900 %elifnidn %5, %6
901 %ifnid %7
902 %assign %%swap 1
903 %endif
904 %endif
905 %if %%swap && %3 == 0 && %8 == 1
906 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
907 %else
908 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
909 %endif
910 %endmacro
911
912 ;%1 == instruction
913 ;%2 == 1 if float, 0 if int
914 ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
915 ;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
916 %macro AVX_INSTR 4
917 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
918 %ifidn %3, fnord
919 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
920 %elifidn %4, fnord
921 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
922 %elifidn %5, fnord
923 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
924 %else
925 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
926 %endif
927 %endmacro
928 %endmacro
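; Expansion sketch (illustrative only): after "AVX_INSTR mulps, 1, 0, 1" below,
;   mulps m0, m1, m2
; assembles to vmulps when avx_enabled, and is otherwise emulated as
;   movaps m0, m1
;   mulps  m0, m2
; (if one source is a memory operand, RUN_AVX_INSTR1 may swap the sources
; first, which is safe because mulps is declared symmetric).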
929
930 AVX_INSTR addpd, 1, 0, 1
931 AVX_INSTR addps, 1, 0, 1
932 AVX_INSTR addsd, 1, 0, 1
933 AVX_INSTR addss, 1, 0, 1
934 AVX_INSTR addsubpd, 1, 0, 0
935 AVX_INSTR addsubps, 1, 0, 0
936 AVX_INSTR andpd, 1, 0, 1
937 AVX_INSTR andps, 1, 0, 1
938 AVX_INSTR andnpd, 1, 0, 0
939 AVX_INSTR andnps, 1, 0, 0
940 AVX_INSTR blendpd, 1, 0, 0
941 AVX_INSTR blendps, 1, 0, 0
942 AVX_INSTR blendvpd, 1, 0, 0
943 AVX_INSTR blendvps, 1, 0, 0
944 AVX_INSTR cmppd, 1, 0, 0
945 AVX_INSTR cmpps, 1, 0, 0
946 AVX_INSTR cmpsd, 1, 0, 0
947 AVX_INSTR cmpss, 1, 0, 0
948 AVX_INSTR cvtdq2ps, 1, 0, 0
949 AVX_INSTR cvtps2dq, 1, 0, 0
950 AVX_INSTR divpd, 1, 0, 0
951 AVX_INSTR divps, 1, 0, 0
952 AVX_INSTR divsd, 1, 0, 0
953 AVX_INSTR divss, 1, 0, 0
954 AVX_INSTR dppd, 1, 1, 0
955 AVX_INSTR dpps, 1, 1, 0
956 AVX_INSTR haddpd, 1, 0, 0
957 AVX_INSTR haddps, 1, 0, 0
958 AVX_INSTR hsubpd, 1, 0, 0
959 AVX_INSTR hsubps, 1, 0, 0
960 AVX_INSTR maxpd, 1, 0, 1
961 AVX_INSTR maxps, 1, 0, 1
962 AVX_INSTR maxsd, 1, 0, 1
963 AVX_INSTR maxss, 1, 0, 1
964 AVX_INSTR minpd, 1, 0, 1
965 AVX_INSTR minps, 1, 0, 1
966 AVX_INSTR minsd, 1, 0, 1
967 AVX_INSTR minss, 1, 0, 1
968 AVX_INSTR movhlps, 1, 0, 0
969 AVX_INSTR movlhps, 1, 0, 0
970 AVX_INSTR movsd, 1, 0, 0
971 AVX_INSTR movss, 1, 0, 0
972 AVX_INSTR mpsadbw, 0, 1, 0
973 AVX_INSTR mulpd, 1, 0, 1
974 AVX_INSTR mulps, 1, 0, 1
975 AVX_INSTR mulsd, 1, 0, 1
976 AVX_INSTR mulss, 1, 0, 1
977 AVX_INSTR orpd, 1, 0, 1
978 AVX_INSTR orps, 1, 0, 1
979 AVX_INSTR packsswb, 0, 0, 0
980 AVX_INSTR packssdw, 0, 0, 0
981 AVX_INSTR packuswb, 0, 0, 0
982 AVX_INSTR packusdw, 0, 0, 0
983 AVX_INSTR paddb, 0, 0, 1
984 AVX_INSTR paddw, 0, 0, 1
985 AVX_INSTR paddd, 0, 0, 1
986 AVX_INSTR paddq, 0, 0, 1
987 AVX_INSTR paddsb, 0, 0, 1
988 AVX_INSTR paddsw, 0, 0, 1
989 AVX_INSTR paddusb, 0, 0, 1
990 AVX_INSTR paddusw, 0, 0, 1
991 AVX_INSTR palignr, 0, 1, 0
992 AVX_INSTR pand, 0, 0, 1
993 AVX_INSTR pandn, 0, 0, 0
994 AVX_INSTR pavgb, 0, 0, 1
995 AVX_INSTR pavgw, 0, 0, 1
996 AVX_INSTR pblendvb, 0, 0, 0
997 AVX_INSTR pblendw, 0, 1, 0
998 AVX_INSTR pcmpestri, 0, 0, 0
999 AVX_INSTR pcmpestrm, 0, 0, 0
1000 AVX_INSTR pcmpistri, 0, 0, 0
1001 AVX_INSTR pcmpistrm, 0, 0, 0
1002 AVX_INSTR pcmpeqb, 0, 0, 1
1003 AVX_INSTR pcmpeqw, 0, 0, 1
1004 AVX_INSTR pcmpeqd, 0, 0, 1
1005 AVX_INSTR pcmpeqq, 0, 0, 1
1006 AVX_INSTR pcmpgtb, 0, 0, 0
1007 AVX_INSTR pcmpgtw, 0, 0, 0
1008 AVX_INSTR pcmpgtd, 0, 0, 0
1009 AVX_INSTR pcmpgtq, 0, 0, 0
1010 AVX_INSTR phaddw, 0, 0, 0
1011 AVX_INSTR phaddd, 0, 0, 0
1012 AVX_INSTR phaddsw, 0, 0, 0
1013 AVX_INSTR phsubw, 0, 0, 0
1014 AVX_INSTR phsubd, 0, 0, 0
1015 AVX_INSTR phsubsw, 0, 0, 0
1016 AVX_INSTR pmaddwd, 0, 0, 1
1017 AVX_INSTR pmaddubsw, 0, 0, 0
1018 AVX_INSTR pmaxsb, 0, 0, 1
1019 AVX_INSTR pmaxsw, 0, 0, 1
1020 AVX_INSTR pmaxsd, 0, 0, 1
1021 AVX_INSTR pmaxub, 0, 0, 1
1022 AVX_INSTR pmaxuw, 0, 0, 1
1023 AVX_INSTR pmaxud, 0, 0, 1
1024 AVX_INSTR pminsb, 0, 0, 1
1025 AVX_INSTR pminsw, 0, 0, 1
1026 AVX_INSTR pminsd, 0, 0, 1
1027 AVX_INSTR pminub, 0, 0, 1
1028 AVX_INSTR pminuw, 0, 0, 1
1029 AVX_INSTR pminud, 0, 0, 1
1030 AVX_INSTR pmulhuw, 0, 0, 1
1031 AVX_INSTR pmulhrsw, 0, 0, 1
1032 AVX_INSTR pmulhw, 0, 0, 1
1033 AVX_INSTR pmullw, 0, 0, 1
1034 AVX_INSTR pmulld, 0, 0, 1
1035 AVX_INSTR pmuludq, 0, 0, 1
1036 AVX_INSTR pmuldq, 0, 0, 1
1037 AVX_INSTR por, 0, 0, 1
1038 AVX_INSTR psadbw, 0, 0, 1
1039 AVX_INSTR pshufb, 0, 0, 0
1040 AVX_INSTR psignb, 0, 0, 0
1041 AVX_INSTR psignw, 0, 0, 0
1042 AVX_INSTR psignd, 0, 0, 0
1043 AVX_INSTR psllw, 0, 0, 0
1044 AVX_INSTR pslld, 0, 0, 0
1045 AVX_INSTR psllq, 0, 0, 0
1046 AVX_INSTR pslldq, 0, 0, 0
1047 AVX_INSTR psraw, 0, 0, 0
1048 AVX_INSTR psrad, 0, 0, 0
1049 AVX_INSTR psrlw, 0, 0, 0
1050 AVX_INSTR psrld, 0, 0, 0
1051 AVX_INSTR psrlq, 0, 0, 0
1052 AVX_INSTR psrldq, 0, 0, 0
1053 AVX_INSTR psubb, 0, 0, 0
1054 AVX_INSTR psubw, 0, 0, 0
1055 AVX_INSTR psubd, 0, 0, 0
1056 AVX_INSTR psubq, 0, 0, 0
1057 AVX_INSTR psubsb, 0, 0, 0
1058 AVX_INSTR psubsw, 0, 0, 0
1059 AVX_INSTR psubusb, 0, 0, 0
1060 AVX_INSTR psubusw, 0, 0, 0
1061 AVX_INSTR punpckhbw, 0, 0, 0
1062 AVX_INSTR punpckhwd, 0, 0, 0
1063 AVX_INSTR punpckhdq, 0, 0, 0
1064 AVX_INSTR punpckhqdq, 0, 0, 0
1065 AVX_INSTR punpcklbw, 0, 0, 0
1066 AVX_INSTR punpcklwd, 0, 0, 0
1067 AVX_INSTR punpckldq, 0, 0, 0
1068 AVX_INSTR punpcklqdq, 0, 0, 0
1069 AVX_INSTR pxor, 0, 0, 1
1070 AVX_INSTR shufps, 1, 1, 0
1071 AVX_INSTR subpd, 1, 0, 0
1072 AVX_INSTR subps, 1, 0, 0
1073 AVX_INSTR subsd, 1, 0, 0
1074 AVX_INSTR subss, 1, 0, 0
1075 AVX_INSTR unpckhpd, 1, 0, 0
1076 AVX_INSTR unpckhps, 1, 0, 0
1077 AVX_INSTR unpcklpd, 1, 0, 0
1078 AVX_INSTR unpcklps, 1, 0, 0
1079 AVX_INSTR xorpd, 1, 0, 1
1080 AVX_INSTR xorps, 1, 0, 1
1081
1082 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
1083 AVX_INSTR pfadd, 1, 0, 1
1084 AVX_INSTR pfsub, 1, 0, 0
1085 AVX_INSTR pfmul, 1, 0, 1
1086
1087 ; base-4 constants for shuffles
1088 %assign i 0
1089 %rep 256
1090 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1091 %if j < 10
1092 CAT_XDEFINE q000, j, i
1093 %elif j < 100
1094 CAT_XDEFINE q00, j, i
1095 %elif j < 1000
1096 CAT_XDEFINE q0, j, i
1097 %else
1098 CAT_XDEFINE q, j, i
1099 %endif
1100 %assign i i+1
1101 %endrep
1102 %undef i
1103 %undef j
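; Usage sketch (illustrative only): q#### spells the four 2-bit fields of a
; shuffle immediate, most-significant digit first, so q0123 == 0x1b and
;   pshufd m0, m1, q0123
; reverses the order of the four dwords, while q3210 (0xe4) is the identity.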
1104
1105 %macro FMA_INSTR 3
1106 %macro %1 4-7 %1, %2, %3
1107 %if cpuflag(xop)
1108 v%5 %1, %2, %3, %4
1109 %else
1110 %6 %1, %2, %3
1111 %7 %1, %4
1112 %endif
1113 %endmacro
1114 %endmacro
1115
1116 FMA_INSTR pmacsdd, pmulld, paddd
1117 FMA_INSTR pmacsww, pmullw, paddw
1118 FMA_INSTR pmadcswd, pmaddwd, paddd
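; Usage sketch (illustrative only): these wrappers pick the fused XOP form when
; available and otherwise fall back to a separate multiply and add, e.g.
;   pmacsdd m0, m1, m2, m3
; becomes vpmacsdd m0, m1, m2, m3 with XOP, or pmulld m0, m1, m2 followed by
; paddd m0, m3 (i.e. m0 = m1*m2 + m3) without it.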