source/libvpx/third_party/x86inc/x86inc.asm - Issue 11555023: libvpx: Add VP9 decoder.

Side by Side Diff: source/libvpx/third_party/x86inc/x86inc.asm

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 ;*****************************************************************************

	2 ;* x86inc.asm: x264asm abstraction layer

	3 ;*****************************************************************************

	4 ;* Copyright (C) 2005-2012 x264 project

	5 ;*

	6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>

	7 ;* Anton Mitrofanov <BugMaster@narod.ru>

	8 ;* Jason Garrett-Glaser <darkshikari@gmail.com>

	9 ;* Henrik Gramner <hengar-6@student.ltu.se>

	10 ;*

	11 ;* Permission to use, copy, modify, and/or distribute this software for any

	12 ;* purpose with or without fee is hereby granted, provided that the above

	13 ;* copyright notice and this permission notice appear in all copies.

	14 ;*

	15 ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES

	16 ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF

	17 ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR

	18 ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

	19 ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN

	20 ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF

	21 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	22 ;*****************************************************************************

	23

	24 ; This is a header file for the x264ASM assembly language, which uses

	25 ; NASM/YASM syntax combined with a large number of macros to provide easy

	26 ; abstraction between different calling conventions (x86_32, win64, linux64).

	27 ; It also has various other useful features to simplify writing the kind of

	28 ; DSP functions that are most often used in x264.

	29

	30 ; Unlike the rest of x264, this file is available under an ISC license, as it

	31 ; has significant usefulness outside of x264 and we want it to be available

	32 ; to the largest audience possible. Of course, if you modify it for your own

	33 ; purposes to add a new feature, we strongly encourage contributing a patch

	34 ; as this feature might be useful for others as well. Send patches or ideas

	35 ; to x264-devel@videolan.org .

	36

	37 %include "vpx_config.asm"

	38

	39 %define program_name vp9

	40

	41

	; Pick the 64-bit calling convention from the assembler output format.
	; Both flags stay 0 when ARCH_X86_64 is 0 (32-bit build).
	%define UNIX64 0
	%define WIN64  0
	%if ARCH_X86_64
	    %ifidn __OUTPUT_FORMAT__,x64
	        %define WIN64 1
	    %elifidn __OUTPUT_FORMAT__,win64
	        %define WIN64 1
	    %elifidn __OUTPUT_FORMAT__,win32
	        %define WIN64 1
	    %else
	        ; every other 64-bit output format uses the *nix register assignment
	        %define UNIX64 1
	    %endif
	%endif

	55

	; mangle(x): decorate a C symbol name for the current output format.
	; ELF and 64-bit Windows objects use the bare name; every other format
	; (e.g. win32, macho) gets a leading underscore.
	; win64 is listed next to x64 because both select WIN64 above, and
	; 64-bit Windows does not prefix C symbols with an underscore.
	%ifidn __OUTPUT_FORMAT__,elf32
	    %define mangle(x) x
	%elifidn __OUTPUT_FORMAT__,elf64
	    %define mangle(x) x
	%elifidn __OUTPUT_FORMAT__,elf
	    %define mangle(x) x
	%elifidn __OUTPUT_FORMAT__,x64
	    %define mangle(x) x
	%elifidn __OUTPUT_FORMAT__,win64
	    %define mangle(x) x
	%else
	    %define mangle(x) _ %+ x
	%endif

	67

	68 ; FIXME: All of the 64bit asm functions that take a stride as an argument

	69 ; via register, assume that the high dword of that register is filled with 0.

	70 ; This is true in practice (since we never do any 64bit arithmetic on strides,

	71 ; and x264's strides are all positive), but is not guaranteed by the ABI.

	72

	; SECTION_RODATA [alignment, default 16]
	; Switch to the section that holds read-only data.
	; Kludge: Something on OS X fails to align .rodata even given an align
	; attribute, so the Mach-O formats use .text as the read-only section.
	; aout also uses .text, and without an align= attribute.
	%macro SECTION_RODATA 0-1 16
	    %ifidn __OUTPUT_FORMAT__,aout
	        section .text
	    %elifidn __OUTPUT_FORMAT__,macho
	        SECTION .text align=%1
	        fakegot:
	    %elifidn __OUTPUT_FORMAT__,macho64
	        SECTION .text align=%1
	    %else
	        SECTION .rodata align=%1
	    %endif
	%endmacro

	88

	; SECTION_TEXT [alignment, default 16]
	; Switch to the code section. aout does not support align=, so the
	; attribute is only emitted for the other output formats.
	%macro SECTION_TEXT 0-1 16
	    %ifnidn __OUTPUT_FORMAT__,aout
	        SECTION .text align=%1
	    %else
	        SECTION .text
	    %endif
	%endmacro

	97

	; Decide whether position-independent addressing is used.
	; The order of the tests matters: WIN64 and macho64 always get PIC,
	; 32-bit builds never do, remaining 64-bit builds follow CONFIG_PIC.
	%if WIN64
	    %define PIC
	%elifidn __OUTPUT_FORMAT__,macho64
	    %define PIC
	%elif ARCH_X86_64 == 0
	    ; x86_32 doesn't require PIC.
	    ; Some distros prefer shared objects to be PIC, but nothing breaks if
	    ; the code contains a few textrels, so we'll skip that complexity.
	    %undef PIC
	%elif CONFIG_PIC
	    %define PIC
	%endif

	; With PIC, make memory references RIP-relative by default.
	%ifdef PIC
	    default rel
	%endif

	; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
	CPU amdnop

	116

	117 ; Macros to eliminate most code duplication between x86_32 and x86_64:

	118 ; Currently this works only for leaf functions which load all their arguments

	119 ; into registers at the start, and make no other use of the stack. Luckily that

	120 ; covers most of x264's asm.

	121

	122 ; PROLOGUE:

	123 ; %1 = number of arguments. loads them from stack if needed.

	124 ; %2 = number of registers used. pushes callee-saved regs if needed.

	125 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.

	126 ; %4 = list of names to define to registers

	127 ; PROLOGUE can also be invoked by adding the same options to cglobal

	128

	129 ; e.g.

	130 ; cglobal foo, 2,3,0, dst, src, tmp

	131 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

	132

	133 ; TODO Some functions can use some args directly from the stack. If they're the

	134 ; last args then you can just not declare them, but if they're in the middle

	135 ; we need more flexible macro.

	136

	137 ; RET:

	138 ; Pops anything that was pushed by PROLOGUE, and returns.

	139

	140 ; REP_RET:

	141 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons

	142 ; which are slow when a normal ret follows a branch.

	143

	144 ; registers:

	145 ; rN and rNq are the native-size register holding function argument N

	146 ; rNd, rNw, rNb are dword, word, and byte size

	147 ; rNm is the original location of arg N (a register or on the stack), dword

	148 ; rNmp is native size

	149

	; DECLARE_REG index, native, dword, word, byte [, stack_offset]
	; Defines r<index> and its q/d/w/b views as aliases for the given register.
	; r<index>m / r<index>mp name where the argument originally lives:
	;   - 5 parameters: the argument arrives in the register itself;
	;   - 6 parameters: it arrives on the stack, %6 bytes above the stack
	;     pointer value at function entry (stack_offset tracks later pushes).
	%macro DECLARE_REG 5-6
	    %define r%1  %2
	    %define r%1q %2
	    %define r%1d %3
	    %define r%1w %4
	    %define r%1b %5
	    %if %0 != 5
	        ; argument lives in memory
	        %if ARCH_X86_64
	            %define r%1m [rsp + stack_offset + %6]
	            %define r%1mp qword r %+ %1m
	        %else
	            %define r%1m [esp + stack_offset + %6]
	            %define r%1mp dword r %+ %1m
	        %endif
	    %else
	        ; argument lives in the register
	        %define r%1m  %3
	        %define r%1mp %2
	    %endif
	%endmacro

	167

	; DECLARE_REG_SIZE word_name, byte_name
	; Gives a named register (ax, bx, ...) the same q/d/w/b suffix scheme as
	; the numbered rN registers, for both the r- and e- prefixed spellings.
	; On x86_32 the r-prefixed name is also mapped onto the e-prefixed one.
	%macro DECLARE_REG_SIZE 2
	    ; e-prefixed spellings
	    %define e%1q r%1
	    %define e%1d e%1
	    %define e%1w %1
	    %define e%1b %2
	    ; r-prefixed spellings
	    %define r%1q r%1
	    %define r%1d e%1
	    %define r%1w %1
	    %define r%1b %2
	    %if ARCH_X86_64 == 0
	        %define r%1 e%1
	    %endif
	%endmacro

	DECLARE_REG_SIZE ax, al
	DECLARE_REG_SIZE bx, bl
	DECLARE_REG_SIZE cx, cl
	DECLARE_REG_SIZE dx, dl
	DECLARE_REG_SIZE si, sil
	DECLARE_REG_SIZE di, dil
	DECLARE_REG_SIZE bp, bpl

	189

	; t# defines for when per-arch register allocation is more complex than
	; just function arguments.
	; DECLARE_REG_TMP a, b, c, ... makes t0 = r<a>, t1 = r<b>, t2 = r<c>, ...
	%macro DECLARE_REG_TMP 1-*
	    %assign %%slot 0
	    %rep %0
	        CAT_XDEFINE t, %%slot, r%1
	        %rotate 1
	        %assign %%slot %%slot+1
	    %endrep
	%endmacro

	200

	; DECLARE_REG_TMP_SIZE n, ...
	; For each listed n, defines t<n>q/d/w/b as t<n> with the size suffix
	; appended when expanded, so they follow whatever t<n> currently names.
	%macro DECLARE_REG_TMP_SIZE 0-*
	    %rep %0
	        %define t%1b t%1 %+ b
	        %define t%1w t%1 %+ w
	        %define t%1d t%1 %+ d
	        %define t%1q t%1 %+ q
	        %rotate 1
	    %endrep
	%endmacro

	DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

	212

	; gprsize: size in bytes of a general-purpose register / stack slot.
	%if ARCH_X86_64 == 0
	    %define gprsize 4
	%else
	    %define gprsize 8
	%endif

	218

	; PUSH reg: push, and record the extra gprsize bytes in stack_offset so
	; that the stack-relative rNm argument references stay correct.
	%macro PUSH 1
	    push %1
	    %assign stack_offset gprsize+stack_offset
	%endmacro

	; POP reg: pop, and remove gprsize bytes from stack_offset.
	%macro POP 1
	    pop %1
	    %assign stack_offset (stack_offset)-gprsize
	%endmacro

	228

	; PUSH_IF_USED n, ...
	; Pushes r<n> (tracking stack_offset through PUSH) for every listed
	; index that is below regs_used.
	%macro PUSH_IF_USED 1-*
	    %rep %0
	        %if regs_used > %1
	            PUSH r%1
	        %endif
	        %rotate 1
	    %endrep
	%endmacro

	237

	; POP_IF_USED n, ...
	; Pops r<n> for every listed index that is below regs_used.
	; Uses the plain pop instruction, so stack_offset is left untouched.
	%macro POP_IF_USED 1-*
	    %rep %0
	        %if regs_used > %1
	            pop r%1
	        %endif
	        %rotate 1
	    %endrep
	%endmacro

	246

	; LOAD_IF_USED n, ...
	; For every listed index below num_args, loads argument n from its
	; original native-size location (r<n>mp) into register r<n>.
	%macro LOAD_IF_USED 1-*
	    %rep %0
	        %if num_args > %1
	            mov r%1, r %+ %1 %+ mp
	        %endif
	        %rotate 1
	    %endrep
	%endmacro

	255

	; SUB dst, src: like sub, but when dst is rsp the amount is added to
	; stack_offset so stack-relative argument references keep working.
	%macro SUB 2
	    sub %1, %2
	    %ifidn %1, rsp
	        %assign stack_offset (%2)+stack_offset
	    %endif
	%endmacro

	; ADD dst, src: like add, but when dst is rsp the amount is removed
	; from stack_offset.
	%macro ADD 2
	    add %1, %2
	    %ifidn %1, rsp
	        %assign stack_offset (stack_offset)-(%2)
	    %endif
	%endmacro

	269

	; movifnidn dst, src: emit mov only when the two operands are not
	; textually identical.
	%macro movifnidn 2
	    %ifidn %1, %2
	        ; same operand: nothing to emit
	    %else
	        mov %1, %2
	    %endif
	%endmacro

	; movsxdifnidn dst, src: same test, but the move is a movsxd.
	%macro movsxdifnidn 2
	    %ifidn %1, %2
	        ; same operand: nothing to emit
	    %else
	        movsxd %1, %2
	    %endif
	%endmacro

	281

	; ASSERT expr: fail assembly with "assert failed" when the preprocessor
	; expression evaluates to zero.
	%macro ASSERT 1
	    %if (%1)
	        ; condition holds
	    %else
	        %error assert failed
	    %endif
	%endmacro

	287

	; DEFINE_ARGS name0, name1, ...
	; Binds the given names to r0, r1, ... : for each name it defines
	; <name>q/d/w/b/m/mp as the matching r<N> forms. Names bound by a previous
	; DEFINE_ARGS (remembered in arg_name<N> / n_arg_names) are undefined first.
	%macro DEFINE_ARGS 0-*
	    ; drop the aliases of the previous invocation, if there was one
	    %ifdef n_arg_names
	        %assign %%n 0
	        %rep n_arg_names
	            CAT_UNDEF arg_name %+ %%n, q
	            CAT_UNDEF arg_name %+ %%n, d
	            CAT_UNDEF arg_name %+ %%n, w
	            CAT_UNDEF arg_name %+ %%n, b
	            CAT_UNDEF arg_name %+ %%n, m
	            CAT_UNDEF arg_name %+ %%n, mp
	            ; the bookkeeping entry goes last: the lines above expand it
	            CAT_UNDEF arg_name, %%n
	            %assign %%n %%n+1
	        %endrep
	    %endif

	    ; stack_offset is hidden while the aliases are created so that the
	    ; current value of stack_offset doesn't get baked in by xdefine
	    %xdefine %%saved_offset stack_offset
	    %undef stack_offset
	    %assign %%n 0
	    %rep %0
	        %xdefine %1q r %+ %%n %+ q
	        %xdefine %1d r %+ %%n %+ d
	        %xdefine %1w r %+ %%n %+ w
	        %xdefine %1b r %+ %%n %+ b
	        %xdefine %1m r %+ %%n %+ m
	        %xdefine %1mp r %+ %%n %+ mp
	        CAT_XDEFINE arg_name, %%n, %1
	        %assign %%n %%n+1
	        %rotate 1
	    %endrep
	    %xdefine stack_offset %%saved_offset
	    %assign n_arg_names %0
	%endmacro

	320

	321 %if WIN64 ; Windows x64 ;=================================================

	322

	; Windows x64 register assignment. r0-r3 arrive in registers; r4 and up
	; arrive on the stack at the given offset from the entry stack pointer.
	; The upper-case R8..R15 spellings are kept deliberately: the lower-case
	; r8..r14 names are redefined by this very table.
	DECLARE_REG 0,  rcx, ecx,  cx,   cl
	DECLARE_REG 1,  rdx, edx,  dx,   dl
	DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
	DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
	DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
	DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
	DECLARE_REG 6,  rax, eax,  ax,   al,   56
	DECLARE_REG 7,  rdi, edi,  di,   dil,  64
	DECLARE_REG 8,  rsi, esi,  si,   sil,  72
	DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
	DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
	DECLARE_REG 11, R12, R12D, R12W, R12B, 96
	DECLARE_REG 12, R13, R13D, R13W, R13B, 104
	DECLARE_REG 13, R14, R14D, R14W, R14B, 112
	DECLARE_REG 14, R15, R15D, R15W, R15B, 120

	338

	; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (Windows x64)
	; Saves r7..r14 when they are within regs_used, spills xmm registers
	; through WIN64_SPILL_XMM unless mmsize is 8, loads arguments 4 and up
	; from the stack, then binds the argument names.
	%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
	    %assign num_args %1
	    %assign regs_used %2
	    ASSERT regs_used <= 15
	    ASSERT regs_used >= num_args
	    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
	    %if mmsize != 8
	        WIN64_SPILL_XMM %3
	    %else
	        %assign xmm_regs_used 0
	    %endif
	    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
	    DEFINE_ARGS %4
	%endmacro

	353

	; WIN64_SPILL_XMM #xmm_regs
	; Records the count in xmm_regs_used. When more than 6 are used, reserves
	; (count-6)*16+16 bytes of stack and stores xmm6..xmm<count-1> there,
	; highest register first. The (~stack_offset&8) term shifts the slots by
	; 8 bytes when bit 3 of stack_offset is clear.
	; NOTE(review): presumably this keeps the movdqa slots 16-byte aligned
	; given the usual rsp alignment at function entry -- confirm.
	%macro WIN64_SPILL_XMM 1
	    %assign xmm_regs_used %1
	    ASSERT xmm_regs_used <= 16
	    %if xmm_regs_used > 6
	        SUB rsp, (xmm_regs_used-6)*16+16
	        %assign %%reg xmm_regs_used
	        %rep (xmm_regs_used-6)
	            %assign %%reg %%reg-1
	            movdqa [rsp + (%%reg-6)*16+(~stack_offset&8)], xmm %+ %%reg
	        %endrep
	    %endif
	%endmacro

	366

	; WIN64_RESTORE_XMM_INTERNAL base
	; Reloads xmm6..xmm<xmm_regs_used-1> from the slots written by
	; WIN64_SPILL_XMM, addressed relative to %1, then adds the size of the
	; spill area to %1. Does not modify stack_offset or xmm_regs_used.
	%macro WIN64_RESTORE_XMM_INTERNAL 1
	    %if xmm_regs_used > 6
	        %assign %%reg xmm_regs_used
	        %rep (xmm_regs_used-6)
	            %assign %%reg %%reg-1
	            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+(~stack_offset&8)]
	        %endrep
	        add %1, (xmm_regs_used-6)*16+16
	    %endif
	%endmacro

	377

	; WIN64_RESTORE_XMM base
	; Restores the spilled xmm registers and updates the bookkeeping, so the
	; function continues as if WIN64_SPILL_XMM had never run.
	; stack_offset must drop by exactly what WIN64_SPILL_XMM added through
	; SUB rsp, i.e. (xmm_regs_used-6)*16+16. The whole amount is parenthesised
	; (the old expression added the trailing 16 instead of subtracting it),
	; and nothing is adjusted when no spill area was allocated.
	%macro WIN64_RESTORE_XMM 1
	    WIN64_RESTORE_XMM_INTERNAL %1
	    %if xmm_regs_used > 6
	        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
	    %endif
	    %assign xmm_regs_used 0
	%endmacro

	383

	; RET (Windows x64): undo PROLOGUE -- reload spilled xmm registers,
	; pop the saved general-purpose registers in reverse push order, return.
	; Uses the non-bookkeeping restore/pop forms, so stack_offset and
	; xmm_regs_used are left as they were.
	%macro RET 0
	    WIN64_RESTORE_XMM_INTERNAL rsp
	    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
	    ret
	%endmacro

	389

	; REP_RET (Windows x64): same as RET when PROLOGUE saved anything
	; (general-purpose registers beyond r6 or xmm registers beyond xmm5);
	; otherwise emits the 2-byte "rep ret" form.
	; The condition uses the logical-or operator "||".
	%macro REP_RET 0
	    %if regs_used > 7 || xmm_regs_used > 6
	        RET
	    %else
	        rep ret
	    %endif
	%endmacro

	397

	398 %elif ARCH_X86_64 ; *nix x64 ;=============================================

	399

	; *nix x64 register assignment. r0-r5 arrive in registers; r6 and up
	; arrive on the stack at the given offset from the entry stack pointer.
	; The upper-case R8..R15 spellings are kept deliberately: the lower-case
	; r8..r14 names are redefined by this very table.
	DECLARE_REG 0,  rdi, edi,  di,   dil
	DECLARE_REG 1,  rsi, esi,  si,   sil
	DECLARE_REG 2,  rdx, edx,  dx,   dl
	DECLARE_REG 3,  rcx, ecx,  cx,   cl
	DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
	DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
	DECLARE_REG 6,  rax, eax,  ax,   al,   8
	DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
	DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
	DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
	DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
	DECLARE_REG 11, R12, R12D, R12W, R12B, 48
	DECLARE_REG 12, R13, R13D, R13W, R13B, 56
	DECLARE_REG 13, R14, R14D, R14W, R14B, 64
	DECLARE_REG 14, R15, R15D, R15W, R15B, 72

	415

	; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (*nix x64)
	; Saves r9..r14 when they are within regs_used, loads arguments 6 and up
	; from the stack, then binds the argument names. %3 is accepted but not
	; used by this variant.
	%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
	    %assign num_args %1
	    %assign regs_used %2
	    ASSERT regs_used <= 15
	    ASSERT regs_used >= num_args
	    PUSH_IF_USED 9, 10, 11, 12, 13, 14
	    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
	    DEFINE_ARGS %4
	%endmacro

	425

	; RET (*nix x64): pop the registers PROLOGUE pushed, in reverse order,
	; and return.
	%macro RET 0
	    POP_IF_USED 14, 13, 12, 11, 10, 9
	    ret
	%endmacro

	; REP_RET (*nix x64): when nothing was pushed (regs_used <= 9) emit the
	; 2-byte "rep ret" form, otherwise behave like RET.
	%macro REP_RET 0
	    %if regs_used <= 9
	        rep ret
	    %else
	        RET
	    %endif
	%endmacro

	438

	439 %else ; X86_32 ;==============================================================

	440

	; x86_32 register assignment: every argument arrives on the stack.
	; esi/edi/ebp get "null" as the name of their byte-sized view.
	DECLARE_REG 0, eax, eax, ax, al,   4
	DECLARE_REG 1, ecx, ecx, cx, cl,   8
	DECLARE_REG 2, edx, edx, dx, dl,   12
	DECLARE_REG 3, ebx, ebx, bx, bl,   16
	DECLARE_REG 4, esi, esi, si, null, 20
	DECLARE_REG 5, edi, edi, di, null, 24
	DECLARE_REG 6, ebp, ebp, bp, null, 28
	; lets shared code spell the stack pointer as rsp on every target
	%define rsp esp

	449

	; DECLARE_ARG n, ...   (x86_32)
	; Defines only the memory forms r<n>m / r<n>mp for arguments that have
	; no register of their own; argument n sits at esp + stack_offset + 4*n + 4.
	%macro DECLARE_ARG 1-*
	    %rep %0
	        %define r%1mp dword r%1m
	        %define r%1m [esp + stack_offset + 4*%1 + 4]
	        %rotate 1
	    %endrep
	%endmacro

	DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

	459

	; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (x86_32)
	; regs_used is clamped to 7, the number of registers declared for this
	; target. Saves r3..r6 when used, loads arguments 0..6 from the stack,
	; then binds the argument names. %3 is not used by this variant.
	%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
	    %assign num_args %1
	    %if (%2) > 7
	        %assign regs_used 7
	    %else
	        %assign regs_used %2
	    %endif
	    ASSERT regs_used >= num_args
	    PUSH_IF_USED 3, 4, 5, 6
	    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
	    DEFINE_ARGS %4
	%endmacro

	471

	; RET (x86_32): pop the registers PROLOGUE pushed, in reverse order,
	; and return.
	%macro RET 0
	    POP_IF_USED 6, 5, 4, 3
	    ret
	%endmacro

	; REP_RET (x86_32): when nothing was pushed (regs_used <= 3) emit the
	; 2-byte "rep ret" form, otherwise behave like RET.
	%macro REP_RET 0
	    %if regs_used <= 3
	        rep ret
	    %else
	        RET
	    %endif
	%endmacro

	484

	485 %endif ;======================================================================

	486

	; Outside Windows x64 the xmm spill/restore macros expand to nothing,
	; so shared code can invoke them unconditionally.
	%if WIN64 == 0
	    %macro WIN64_RESTORE_XMM 1
	    %endmacro
	    %macro WIN64_SPILL_XMM 1
	    %endmacro
	%endif

	493

	494 ;=============================================================================

	495 ; arch-independent part

	496 ;=============================================================================

	497

	; alignment applied in front of every function started with cglobal
	%assign function_align 16

	; Begin a function.
	; Applies any symbol mangling needed for C linkage, and sets up a define
	; such that subsequent uses of the function name automatically refer to
	; the mangled version.
	; Appends cpuflags to the function name if cpuflags has been specified.
	%macro cglobal 1-2+ ; name, [PROLOGUE args]
	    %if %0 > 1
	        cglobal_internal %1 %+ SUFFIX, %2
	    %else
	        cglobal_internal %1 %+ SUFFIX
	    %endif
	%endmacro

	; cglobal_internal name [, PROLOGUE args]
	; Worker for cglobal; %1 already carries the cpu suffix.
	;  - On first use of a name, redefines it to its mangled, program_name
	;    prefixed form and records it as cglobaled_<name>.
	;  - Exports the symbol (hidden visibility on the ELF formats), aligns,
	;    emits the label and resets the mm register permutation.
	;  - Resets stack_offset to 0 and runs PROLOGUE when arguments are given.
	%macro cglobal_internal 1-2+
	    %ifndef cglobaled_%1
	        %xdefine %1 mangle(program_name %+ _ %+ %1)
	        %xdefine %1.skip_prologue %1 %+ .skip_prologue
	        CAT_XDEFINE cglobaled_, %1, 1
	    %endif
	    %xdefine current_function %1
	    %assign stack_offset 0
	    %ifidn __OUTPUT_FORMAT__,elf64
	        global %1:function hidden
	    %elifidn __OUTPUT_FORMAT__,elf32
	        global %1:function hidden
	    %elifidn __OUTPUT_FORMAT__,elf
	        global %1:function hidden
	    %else
	        global %1
	    %endif
	    align function_align
	    %1:
	    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
	    %if %0 > 1
	        PROLOGUE %2
	    %endif
	%endmacro

	535

	; cextern name: declare an external symbol. The name is redefined to its
	; mangled, program_name-prefixed form and recorded as cglobaled.
	%macro cextern 1
	    %xdefine %1 mangle(program_name %+ _ %+ %1)
	    extern %1
	    CAT_XDEFINE cglobaled_, %1, 1
	%endmacro

	; like cextern, but without the prefix
	%macro cextern_naked 1
	    %xdefine %1 mangle(%1)
	    extern %1
	    CAT_XDEFINE cglobaled_, %1, 1
	%endmacro

	; const name, data...: define an exported data label under the mangled,
	; program_name-prefixed name, followed by the given data directive(s).
	%macro const 2+
	    %xdefine %1 mangle(program_name %+ _ %+ %1)
	    global %1
	    %1: %2
	%endmacro

	554

	; This is needed for ELF, otherwise the GNU linker assumes the stack is
	; executable by default. Emitted for all three ELF output format names.
	%ifidn __OUTPUT_FORMAT__,elf64
	    SECTION .note.GNU-stack noalloc noexec nowrite progbits
	%elifidn __OUTPUT_FORMAT__,elf32
	    SECTION .note.GNU-stack noalloc noexec nowrite progbits
	%elifidn __OUTPUT_FORMAT__,elf
	    SECTION .note.GNU-stack noalloc noexec nowrite progbits
	%endif

	564

	; cpuflags
	; Each instruction-set flag owns one bit and ORs in the flag it builds
	; on, so testing for a flag also matches every superset of it.
	%assign cpuflags_mmx      (1<<0)
	%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
	%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
	%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
	%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
	%assign cpuflags_sse2     (1<<5) | cpuflags_sse
	%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
	%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
	%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
	%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
	%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
	%assign cpuflags_avx      (1<<11)| cpuflags_sse42
	%assign cpuflags_xop      (1<<12)| cpuflags_avx
	%assign cpuflags_fma4     (1<<13)| cpuflags_avx

	; Stand-alone flags that do not imply any other flag.
	%assign cpuflags_cache32  (1<<16)
	%assign cpuflags_cache64  (1<<17)
	%assign cpuflags_slowctz  (1<<18)
	%assign cpuflags_lzcnt    (1<<19)
	%assign cpuflags_misalign (1<<20)
	%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
	%assign cpuflags_atom     (1<<22)

	; cpuflag(x): true when every bit of cpuflags_x is set in cpuflags.
	; notcpuflag(x): the negation.
	%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
	%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

	592

	; Takes up to 2 cpuflags from the above list.
	; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the
	; specified cpu.
	; You shouldn't need to invoke this macro directly, it's a subroutine for
	; INIT_MMX &co.
	;
	; With arguments: sets cpuname, cpuflags and SUFFIX (_<cpuname>), turns
	; avx_enabled on for AVX-capable targets, and retargets the mova/movu/
	; movnta aliases where the selected cpu calls for it.
	; Without arguments: empties SUFFIX and undefines cpuname and cpuflags.
	%macro INIT_CPUFLAGS 0-2
	    %if %0 >= 1
	        %xdefine cpuname %1
	        %assign cpuflags cpuflags_%1
	        %if %0 >= 2
	            %xdefine cpuname %1_%2
	            ; combine both flag sets with bitwise or
	            %assign cpuflags cpuflags | cpuflags_%2
	        %endif
	        %xdefine SUFFIX _ %+ cpuname
	        %if cpuflag(avx)
	            %assign avx_enabled 1
	        %endif
	        ; 16-byte registers without sse2: use the float move forms
	        %if mmsize == 16 && notcpuflag(sse2)
	            %define mova movaps
	            %define movu movups
	            %define movnta movntps
	        %endif
	        %if cpuflag(aligned)
	            %define movu mova
	        %elifidn %1, sse3
	            %define movu lddqu
	        %endif
	    %else
	        %xdefine SUFFIX
	        %undef cpuname
	        %undef cpuflags
	    %endif
	%endmacro

	624

	625 ; merge mmx and sse*

	626

	; CAT_XDEFINE a, b, value: %xdefine the name formed by pasting a and b.
	; Arguments are macro-expanded at the call site, so a counter passed as
	; b contributes its current value to the name.
	%macro CAT_XDEFINE 3
	    %xdefine %1%2 %3
	%endmacro

	; CAT_UNDEF a, b: %undef the name formed by pasting a and b.
	%macro CAT_UNDEF 2
	    %undef %1%2
	%endmacro

	634

	; INIT_MMX [cpuflags]
	; Selects the 8-byte mm registers: m0..m7 name mm0..mm7 and m8..m15 are
	; undefined, the mov* aliases get their MMX forms, and the optional
	; cpuflags are forwarded to INIT_CPUFLAGS.
	%macro INIT_MMX 0-1+
	    %assign avx_enabled 0
	    %define RESET_MM_PERMUTATION INIT_MMX %1
	    %define mmsize 8
	    %define num_mmregs 8
	    %define mova   movq
	    %define movu   movq
	    %define movh   movd
	    %define movnta movntq
	    %assign %%reg 0
	    %rep 8
	        CAT_XDEFINE m, %%reg, mm %+ %%reg
	        CAT_XDEFINE nmm, %%reg, %%reg
	        %assign %%reg %%reg+1
	    %endrep
	    ; the counter carries on from 8: drop aliases 8..15, which may be
	    ; left over from a previous 16-register setup
	    %rep 8
	        CAT_UNDEF m, %%reg
	        CAT_UNDEF nmm, %%reg
	        %assign %%reg %%reg+1
	    %endrep
	    INIT_CPUFLAGS %1
	%endmacro

	657

	; INIT_XMM [cpuflags]
	; Selects the 16-byte xmm registers (16 of them on x86_64, 8 otherwise):
	; m<N> names xmm<N>, the mov* aliases get their integer SSE2 forms, and
	; the optional cpuflags are forwarded to INIT_CPUFLAGS.
	%macro INIT_XMM 0-1+
	    %assign avx_enabled 0
	    %define RESET_MM_PERMUTATION INIT_XMM %1
	    %define mmsize 16
	    %if ARCH_X86_64
	        %define num_mmregs 16
	    %else
	        %define num_mmregs 8
	    %endif
	    %define mova   movdqa
	    %define movu   movdqu
	    %define movh   movq
	    %define movnta movntdq
	    %assign %%reg 0
	    %rep num_mmregs
	        CAT_XDEFINE m, %%reg, xmm %+ %%reg
	        CAT_XDEFINE nxmm, %%reg, %%reg
	        %assign %%reg %%reg+1
	    %endrep
	    INIT_CPUFLAGS %1
	%endmacro

	678

	; FIXME: INIT_AVX can be replaced by INIT_XMM avx
	; INIT_AVX: the plain INIT_XMM setup, then force avx_enabled on, map
	; PALIGNR to PALIGNR_SSSE3 and make RESET_MM_PERMUTATION come back here.
	; The overrides must follow INIT_XMM, which sets its own values for
	; avx_enabled and RESET_MM_PERMUTATION.
	%macro INIT_AVX 0
	    INIT_XMM
	    %define RESET_MM_PERMUTATION INIT_AVX
	    %define PALIGNR PALIGNR_SSSE3
	    %assign avx_enabled 1
	%endmacro

	686

	; INIT_YMM [cpuflags]
	; Selects the 32-byte ymm registers (16 of them on x86_64, 8 otherwise):
	; m<N> names ymm<N>, mova/movu/movnta get their VEX float forms, movh is
	; undefined, and the optional cpuflags are forwarded to INIT_CPUFLAGS.
	%macro INIT_YMM 0-1+
	    %assign avx_enabled 1
	    %define RESET_MM_PERMUTATION INIT_YMM %1
	    %define mmsize 32
	    %if ARCH_X86_64
	        %define num_mmregs 16
	    %else
	        %define num_mmregs 8
	    %endif
	    %define mova   vmovaps
	    %define movu   vmovups
	    %undef  movh
	    %define movnta vmovntps
	    %assign %%reg 0
	    %rep num_mmregs
	        CAT_XDEFINE m, %%reg, ymm %+ %%reg
	        CAT_XDEFINE nymm, %%reg, %%reg
	        %assign %%reg %%reg+1
	    %endrep
	    INIT_CPUFLAGS %1
	%endmacro

	; Default setup for files that include this header: xmm, no cpuflags.
	INIT_XMM

	709

	710 ; I often want to use macros that permute their arguments. e.g. there's no

	711 ; efficient way to implement butterfly or transpose or dct without swapping some

	712 ; arguments.

	713 ;

	714 ; I would like to not have to manually keep track of the permutations:

	715 ; If I insert a permutation in the middle of a function, it should automatically

	716 ; change everything that follows. For more complex macros I may also have multiple

	717 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.

	718 ;

	719 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that

	720 ; permutes its arguments. It's equivalent to exchanging the contents of the

	721 ; registers, except that this way you exchange the register names instead, so it

	722 ; doesn't cost any cycles.

	723

	; PERMUTE dst0, src0, dst1, src1, ...
	; For every pair, m<dst> (and nm<dst>) takes the value m<src> (nm<src>)
	; had before the macro ran. All sources are snapshotted in a first pass,
	; so the pairs may overlap freely.
	%macro PERMUTE 2-* ; takes a list of pairs to swap
	    ; pass 1: snapshot every source
	    %rep %0/2
	        %xdefine ntmp%2 nm%2
	        %xdefine tmp%2 m%2
	        %rotate 2
	    %endrep
	    ; pass 2: assign the snapshots and discard them
	    %rep %0/2
	        %xdefine m%1 tmp%2
	        %undef tmp%2
	        %xdefine nm%1 ntmp%2
	        %undef ntmp%2
	        %rotate 2
	    %endrep
	%endmacro

	738

	; SWAP a, b [, c ...]
	; Exchanges register names along the chain: (a,b), then (b,c), and so on.
	; Arguments are normally register numbers; full names (m0, m1) also work,
	; in which case the numbers are recovered from the n<reg> reverse map.
	; The reverse map is refreshed after every exchange.
	%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
	    %rep %0-1
	        %ifndef m%1
	            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
	            ; Be careful using this mode in nested macros though, as in some cases there may be
	            ; other copies of m# that have already been dereferenced and don't get updated correctly.
	            %xdefine %%idx_a n %+ %1
	            %xdefine %%idx_b n %+ %2
	            %xdefine tmp m %+ %%idx_a
	            CAT_XDEFINE m, %%idx_a, m %+ %%idx_b
	            CAT_XDEFINE m, %%idx_b, tmp
	            CAT_XDEFINE n, m %+ %%idx_a, %%idx_a
	            CAT_XDEFINE n, m %+ %%idx_b, %%idx_b
	        %else
	            ; called with register numbers
	            %xdefine tmp m%1
	            %xdefine m%1 m%2
	            %xdefine m%2 tmp
	            CAT_XDEFINE n, m%1, %1
	            CAT_XDEFINE n, m%2, %2
	        %endif
	        %undef tmp
	        %rotate 1
	    %endrep
	%endmacro

	763

	; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
	; calls to that function will automatically load the permutation, so values
	; can be returned in mmregs.
	; The mapping is stored as <name>_m<N> for N < num_mmregs; <name> is the
	; optional argument, or current_function when it is omitted.
	%macro SAVE_MM_PERMUTATION 0-1
	    %if %0 == 0
	        %xdefine %%prefix current_function %+ _m
	    %else
	        %xdefine %%prefix %1_m
	    %endif
	    %assign %%reg 0
	    %rep num_mmregs
	        CAT_XDEFINE %%prefix, %%reg, m %+ %%reg
	        %assign %%reg %%reg+1
	    %endrep
	%endmacro

	779

	; LOAD_MM_PERMUTATION name
	; If a permutation was saved under <name> (tested via <name>_m0), makes
	; it current: m<N> takes the saved mapping and the n<reg> reverse map is
	; rebuilt. Does nothing when none was saved.
	%macro LOAD_MM_PERMUTATION 1 ; name to load from
	    %ifdef %1_m0
	        %assign %%reg 0
	        %rep num_mmregs
	            CAT_XDEFINE m, %%reg, %1_m %+ %%reg
	            CAT_XDEFINE n, m %+ %%reg, %%reg
	            %assign %%reg %%reg+1
	        %endrep
	    %endif
	%endmacro

	790

	; Append cpuflags to the callee's name iff the appended name is known and
	; the plain name isn't.
	%macro call 1
	    call_internal %1, %1 %+ SUFFIX
	%endmacro

	; call_internal plain, suffixed
	; Calls <plain> unless only <suffixed> has been registered via cglobal/
	; cextern, then loads the mm permutation saved for the chosen callee.
	%macro call_internal 2
	    %xdefine %%target %1
	    %ifndef cglobaled_%1
	        %ifdef cglobaled_%2
	            %xdefine %%target %2
	        %endif
	    %endif
	    call %%target
	    LOAD_MM_PERMUTATION %%target
	%endmacro

	805

	; Substitutions that reduce instruction size but are functionally equivalent
	; add x, 128 is emitted as sub x, -128; any other operand, numeric or
	; not, is emitted unchanged.
	%macro add 2
	    %assign %%negate 0
	    %ifnum %2
	        %if %2==128
	            %assign %%negate 1
	        %endif
	    %endif
	    %if %%negate
	        sub %1, -128
	    %else
	        add %1, %2
	    %endif
	%endmacro

	; sub x, 128 is emitted as add x, -128; any other operand is unchanged.
	%macro sub 2
	    %assign %%negate 0
	    %ifnum %2
	        %if %2==128
	            %assign %%negate 1
	        %endif
	    %endif
	    %if %%negate
	        add %1, -128
	    %else
	        sub %1, %2
	    %endif
	%endmacro

	830

	831 ;=============================================================================

	832 ; AVX abstraction layer

	833 ;=============================================================================

	834

	; sizeof<reg>: register width in bytes, looked up by register name.
	; Defined for mm0-mm7 (8), xmm0-xmm15 (16) and ymm0-ymm15 (32).
	%assign i 0
	%rep 16
	    CAT_XDEFINE sizeofymm, i, 32
	    CAT_XDEFINE sizeofxmm, i, 16
	    %if i < 8
	        CAT_XDEFINE sizeofmm, i, 8
	    %endif
	    %assign i i+1
	%endrep
	%undef i

	845

	;%1 == instruction
	;%2 == 1 if float, 0 if int
	;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
	;%4 == number of operands given
	;%5+: operands
	;
	; Emits %1 in whichever form fits the destination register:
	;  - 32-byte destination: always the v-prefixed instruction;
	;  - otherwise, for the non-destructive form (dst, src1, src2): the
	;    v-prefixed instruction when avx_enabled and dst is 16 bytes wide,
	;    else a register copy dst <- src1 followed by the 2-operand
	;    instruction (the copy is skipped when dst and src1 are identical);
	;  - otherwise the instruction is passed through unchanged.
	; The operand size is sizeof<dst> when dst is an identifier, else mmsize.
	%macro RUN_AVX_INSTR 6-7+
	    %ifid %5
	        %define %%opsize sizeof%5
	    %else
	        %define %%opsize mmsize
	    %endif
	    %if %%opsize==32
	        %if %0 < 7
	            v%1 %5, %6
	        %else
	            v%1 %5, %6, %7
	        %endif
	    %else
	        ; copy instruction used to emulate the non-destructive form
	        %if %%opsize==8
	            %define %%copy movq
	        %elif %2
	            %define %%copy movaps
	        %else
	            %define %%copy movdqa
	        %endif

	        %if %4>=3+%3
	            %ifidn %5, %6
	                %1 %5, %7
	            %else
	                %if avx_enabled && sizeof%5==16
	                    v%1 %5, %6, %7
	                %else
	                    %%copy %5, %6
	                    %1 %5, %7
	                %endif
	            %endif
	        %elif %3
	            %1 %5, %6, %7
	        %else
	            %1 %5, %6
	        %endif
	    %endif
	%endmacro

	890

	; 3arg AVX ops with a memory arg can only have it in src2,
	; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
	; So, if the op is symmetric and the wrong one is memory, swap them.
	; %1-%7 are as for RUN_AVX_INSTR; %8 == 1 if the instruction is symmetric.
	%macro RUN_AVX_INSTR1 8
	    %assign %%flip 0
	    %if avx_enabled
	        ; AVX: src1 (%6) must not be the non-register operand
	        %ifnid %6
	            %assign %%flip 1
	        %endif
	    %elifnidn %5, %6
	        ; SSE emulation with dst != src1: prefer the non-register
	        ; operand in src1 rather than src2 (%7)
	        %ifnid %7
	            %assign %%flip 1
	        %endif
	    %endif
	    %if %%flip && %3 == 0 && %8 == 1
	        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
	    %else
	        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
	    %endif
	%endmacro

	911

	;%1 == instruction
	;%2 == 1 if float, 0 if int
	;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
	;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
	;
	; Defines a macro named after the instruction. The generated macro takes
	; 2 to 5 operands; the "fnord" defaults mark operands that were not
	; supplied, which is how the operand count is detected, and parameters
	; 6-9 carry the four properties above through to RUN_AVX_INSTR. Only the
	; 3-operand form goes through RUN_AVX_INSTR1 (operand swapping).
	%macro AVX_INSTR 4
	    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
	        %ifidn %3, fnord
	            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
	        %elifidn %4, fnord
	            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
	        %elifidn %5, fnord
	            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
	        %else
	            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
	        %endif
	    %endmacro
	%endmacro

	929

	; Instruction table. Columns: name, float, 4-operand, symmetric.
	AVX_INSTR addpd,      1, 0, 1
	AVX_INSTR addps,      1, 0, 1
	AVX_INSTR addsd,      1, 0, 1
	AVX_INSTR addss,      1, 0, 1
	AVX_INSTR addsubpd,   1, 0, 0
	AVX_INSTR addsubps,   1, 0, 0
	AVX_INSTR andpd,      1, 0, 1
	AVX_INSTR andps,      1, 0, 1
	AVX_INSTR andnpd,     1, 0, 0
	AVX_INSTR andnps,     1, 0, 0
	AVX_INSTR blendpd,    1, 0, 0
	AVX_INSTR blendps,    1, 0, 0
	AVX_INSTR blendvpd,   1, 0, 0
	AVX_INSTR blendvps,   1, 0, 0
	AVX_INSTR cmppd,      1, 0, 0
	AVX_INSTR cmpps,      1, 0, 0
	AVX_INSTR cmpsd,      1, 0, 0
	AVX_INSTR cmpss,      1, 0, 0
	AVX_INSTR cvtdq2ps,   1, 0, 0
	AVX_INSTR cvtps2dq,   1, 0, 0
	AVX_INSTR divpd,      1, 0, 0
	AVX_INSTR divps,      1, 0, 0
	AVX_INSTR divsd,      1, 0, 0
	AVX_INSTR divss,      1, 0, 0
	AVX_INSTR dppd,       1, 1, 0
	AVX_INSTR dpps,       1, 1, 0
	AVX_INSTR haddpd,     1, 0, 0
	AVX_INSTR haddps,     1, 0, 0
	AVX_INSTR hsubpd,     1, 0, 0
	AVX_INSTR hsubps,     1, 0, 0
	AVX_INSTR maxpd,      1, 0, 1
	AVX_INSTR maxps,      1, 0, 1
	AVX_INSTR maxsd,      1, 0, 1
	AVX_INSTR maxss,      1, 0, 1
	AVX_INSTR minpd,      1, 0, 1
	AVX_INSTR minps,      1, 0, 1
	AVX_INSTR minsd,      1, 0, 1
	AVX_INSTR minss,      1, 0, 1
	AVX_INSTR movhlps,    1, 0, 0
	AVX_INSTR movlhps,    1, 0, 0
	AVX_INSTR movsd,      1, 0, 0
	AVX_INSTR movss,      1, 0, 0
	AVX_INSTR mpsadbw,    0, 1, 0
	AVX_INSTR mulpd,      1, 0, 1
	AVX_INSTR mulps,      1, 0, 1
	AVX_INSTR mulsd,      1, 0, 1
	AVX_INSTR mulss,      1, 0, 1
	AVX_INSTR orpd,       1, 0, 1
	AVX_INSTR orps,       1, 0, 1
	AVX_INSTR packsswb,   0, 0, 0
	AVX_INSTR packssdw,   0, 0, 0
	AVX_INSTR packuswb,   0, 0, 0
	AVX_INSTR packusdw,   0, 0, 0
	AVX_INSTR paddb,      0, 0, 1
	AVX_INSTR paddw,      0, 0, 1
	AVX_INSTR paddd,      0, 0, 1
	AVX_INSTR paddq,      0, 0, 1
	AVX_INSTR paddsb,     0, 0, 1
	AVX_INSTR paddsw,     0, 0, 1
	AVX_INSTR paddusb,    0, 0, 1
	AVX_INSTR paddusw,    0, 0, 1
	AVX_INSTR palignr,    0, 1, 0
	AVX_INSTR pand,       0, 0, 1
	AVX_INSTR pandn,      0, 0, 0
	AVX_INSTR pavgb,      0, 0, 1
	AVX_INSTR pavgw,      0, 0, 1
	AVX_INSTR pblendvb,   0, 0, 0
	AVX_INSTR pblendw,    0, 1, 0
	AVX_INSTR pcmpestri,  0, 0, 0
	AVX_INSTR pcmpestrm,  0, 0, 0
	AVX_INSTR pcmpistri,  0, 0, 0
	AVX_INSTR pcmpistrm,  0, 0, 0
	AVX_INSTR pcmpeqb,    0, 0, 1
	AVX_INSTR pcmpeqw,    0, 0, 1
	AVX_INSTR pcmpeqd,    0, 0, 1
	AVX_INSTR pcmpeqq,    0, 0, 1
	AVX_INSTR pcmpgtb,    0, 0, 0
	AVX_INSTR pcmpgtw,    0, 0, 0
	AVX_INSTR pcmpgtd,    0, 0, 0
	AVX_INSTR pcmpgtq,    0, 0, 0
	AVX_INSTR phaddw,     0, 0, 0
	AVX_INSTR phaddd,     0, 0, 0
	AVX_INSTR phaddsw,    0, 0, 0
	AVX_INSTR phsubw,     0, 0, 0
	AVX_INSTR phsubd,     0, 0, 0
	AVX_INSTR phsubsw,    0, 0, 0
	AVX_INSTR pmaddwd,    0, 0, 1
	AVX_INSTR pmaddubsw,  0, 0, 0
	AVX_INSTR pmaxsb,     0, 0, 1
	AVX_INSTR pmaxsw,     0, 0, 1
	AVX_INSTR pmaxsd,     0, 0, 1
	AVX_INSTR pmaxub,     0, 0, 1
	AVX_INSTR pmaxuw,     0, 0, 1
	AVX_INSTR pmaxud,     0, 0, 1
	AVX_INSTR pminsb,     0, 0, 1
	AVX_INSTR pminsw,     0, 0, 1
	AVX_INSTR pminsd,     0, 0, 1
	AVX_INSTR pminub,     0, 0, 1
	AVX_INSTR pminuw,     0, 0, 1
	AVX_INSTR pminud,     0, 0, 1
	AVX_INSTR pmulhuw,    0, 0, 1
	AVX_INSTR pmulhrsw,   0, 0, 1
	AVX_INSTR pmulhw,     0, 0, 1
	AVX_INSTR pmullw,     0, 0, 1
	AVX_INSTR pmulld,     0, 0, 1
	AVX_INSTR pmuludq,    0, 0, 1
	AVX_INSTR pmuldq,     0, 0, 1
	AVX_INSTR por,        0, 0, 1
	AVX_INSTR psadbw,     0, 0, 1
	AVX_INSTR pshufb,     0, 0, 0
	AVX_INSTR psignb,     0, 0, 0
	AVX_INSTR psignw,     0, 0, 0
	AVX_INSTR psignd,     0, 0, 0
	AVX_INSTR psllw,      0, 0, 0
	AVX_INSTR pslld,      0, 0, 0
	AVX_INSTR psllq,      0, 0, 0
	AVX_INSTR pslldq,     0, 0, 0
	AVX_INSTR psraw,      0, 0, 0
	AVX_INSTR psrad,      0, 0, 0
	AVX_INSTR psrlw,      0, 0, 0
	AVX_INSTR psrld,      0, 0, 0
	AVX_INSTR psrlq,      0, 0, 0
	AVX_INSTR psrldq,     0, 0, 0
	AVX_INSTR psubb,      0, 0, 0
	AVX_INSTR psubw,      0, 0, 0
	AVX_INSTR psubd,      0, 0, 0
	AVX_INSTR psubq,      0, 0, 0
	AVX_INSTR psubsb,     0, 0, 0
	AVX_INSTR psubsw,     0, 0, 0
	AVX_INSTR psubusb,    0, 0, 0
	AVX_INSTR psubusw,    0, 0, 0
	AVX_INSTR punpckhbw,  0, 0, 0
	AVX_INSTR punpckhwd,  0, 0, 0
	AVX_INSTR punpckhdq,  0, 0, 0
	AVX_INSTR punpckhqdq, 0, 0, 0
	AVX_INSTR punpcklbw,  0, 0, 0
	AVX_INSTR punpcklwd,  0, 0, 0
	AVX_INSTR punpckldq,  0, 0, 0
	AVX_INSTR punpcklqdq, 0, 0, 0
	AVX_INSTR pxor,       0, 0, 1
	AVX_INSTR shufps,     1, 1, 0
	AVX_INSTR subpd,      1, 0, 0
	AVX_INSTR subps,      1, 0, 0
	AVX_INSTR subsd,      1, 0, 0
	AVX_INSTR subss,      1, 0, 0
	AVX_INSTR unpckhpd,   1, 0, 0
	AVX_INSTR unpckhps,   1, 0, 0
	AVX_INSTR unpcklpd,   1, 0, 0
	AVX_INSTR unpcklps,   1, 0, 0
	AVX_INSTR xorpd,      1, 0, 1
	AVX_INSTR xorps,      1, 0, 1

	; 3DNow instructions, for sharing code between AVX, SSE and 3DN
	AVX_INSTR pfadd,      1, 0, 1
	AVX_INSTR pfsub,      1, 0, 0
	AVX_INSTR pfmul,      1, 0, 1

	1086

	; base-4 constants for shuffles
	; Defines q0000 .. q3333: the four digits after "q" are the base-4 digits
	; of the value, most significant first (e.g. q3210 = 0xE4).
	; j holds those digits as a decimal number; each 2-bit field of i is
	; multiplied by its power of ten, and leading zeros are restored by
	; picking the matching prefix.
	%assign i 0
	%rep 256
	    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
	    %if j < 10
	        CAT_XDEFINE q000, j, i
	    %elif j < 100
	        CAT_XDEFINE q00, j, i
	    %elif j < 1000
	        CAT_XDEFINE q0, j, i
	    %else
	        CAT_XDEFINE q, j, i
	    %endif
	    %assign i i+1
	%endrep
	%undef i
	%undef j

	1104

	; FMA_INSTR name, first_op, second_op
	; Defines a 4-operand macro <name> dst, a, b, c. With the xop cpuflag it
	; emits the single v<name> instruction; otherwise it emits
	; "first_op dst, a, b" followed by "second_op dst, c".
	; Parameters 5-7 of the generated macro carry the three names above.
	%macro FMA_INSTR 3
	    %macro %1 4-7 %1, %2, %3
	        %if cpuflag(xop)
	            v%5 %1, %2, %3, %4
	        %else
	            %6 %1, %2, %3
	            %7 %1, %4
	        %endif
	    %endmacro
	%endmacro

	FMA_INSTR pmacsdd,  pmulld,  paddd
	FMA_INSTR pmacsww,  pmullw,  paddw
	FMA_INSTR pmadcswd, pmaddwd, paddd

OLD	NEW

« libvpx.gyp ('K') | « source/libvpx/third_party/x86inc/README.webm ('k') | source/libvpx/tools/all_builds.py » ('j') | no next file with comments »