Index: src/IceTargetLoweringX8632.cpp |
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp |
index 8e56a10bbb83adba9e884821290dda0f289b6c54..f4429416f0feb62fbbbcbddbe089cb799590d450 100644 |
--- a/src/IceTargetLoweringX8632.cpp |
+++ b/src/IceTargetLoweringX8632.cpp |
@@ -121,9 +121,21 @@ Type getInVectorElementType(Type Ty) { |
} |
// The maximum number of arguments to pass in XMM registers |
-const unsigned X86_MAX_XMM_ARGS = 4; |
+const uint32_t X86_MAX_XMM_ARGS = 4; |
// The number of bits in a byte |
-const unsigned X86_CHAR_BIT = 8; |
+const uint32_t X86_CHAR_BIT = 8; |
+// Stack alignment |
+const uint32_t X86_STACK_ALIGNMENT_BYTES = 16; |
+// Size of the return address on the stack |
+const uint32_t X86_RET_IP_SIZE_BYTES = 4; |
+ |
+// Value is a size in bytes. Return Value adjusted to the next highest |
+// multiple of the stack alignment. |
+uint32_t applyStackAlignment(uint32_t Value) { |
+ // power of 2 |
+ assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0); |
+ return (Value + X86_STACK_ALIGNMENT_BYTES - 1) & -X86_STACK_ALIGNMENT_BYTES; |
+} |
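A minimal standalone sketch of the round-up identity applyStackAlignment relies on (the main() harness and test values are hypothetical; only the 16-byte constant comes from the patch): for a power-of-two alignment A, (Value + A - 1) & -A rounds Value up to the next multiple of A.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t A = 16; // mirrors X86_STACK_ALIGNMENT_BYTES
    // -A is an all-ones mask with the low log2(A) bits cleared, so the AND
    // strips the sub-alignment remainder after the round-up add.
    assert(((0u + A - 1) & -A) == 0u);
    assert(((1u + A - 1) & -A) == 16u);
    assert(((16u + A - 1) & -A) == 16u);
    assert(((17u + A - 1) & -A) == 32u);
    return 0;
  }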
// Instruction set options |
namespace cl = ::llvm::cl; |
@@ -248,8 +260,8 @@ void __attribute__((unused)) xMacroIntegrityCheck() { |
TargetX8632::TargetX8632(Cfg *Func) |
: TargetLowering(Func), InstructionSet(CLInstructionSet), |
- IsEbpBasedFrame(false), FrameSizeLocals(0), LocalsSizeBytes(0), |
- NextLabelNumber(0), ComputedLiveRanges(false), |
+ IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0), |
+ LocalsSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false), |
PhysicalRegisters(VarList(Reg_NUM)) { |
// TODO: Don't initialize IntegerRegisters and friends every time. |
// Instead, initialize in some sort of static initializer for the |
@@ -543,6 +555,9 @@ void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr, |
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); |
return; |
} |
+ if (isVectorType(Ty)) { |
+ InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes); |
+ } |
Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes); |
InArgsSizeBytes += typeWidthInBytesOnStack(Ty); |
if (Arg->hasReg()) { |
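A sketch of the in-argument bookkeeping above for a hypothetical on-stack argument list (i32, v4i32), with an assumed BasicFrameOffset of 8; only the pre-vector rounding of InArgsSizeBytes and the 16-byte alignment come from the patch, and the alignTo16 helper is invented for illustration:

  #include <cassert>
  #include <cstdint>

  uint32_t alignTo16(uint32_t V) { return (V + 15) & ~15u; }

  int main() {
    const uint32_t BasicFrameOffset = 8; // assumed ret addr + saved regs
    uint32_t InArgsSizeBytes = 0;
    // i32 argument: no extra alignment, occupies 4 bytes.
    uint32_t OffI32 = BasicFrameOffset + InArgsSizeBytes; // 8
    InArgsSizeBytes += 4;
    // v4i32 argument: the running size is rounded up to 16 first.
    InArgsSizeBytes = alignTo16(InArgsSizeBytes);         // 4 -> 16
    uint32_t OffVec = BasicFrameOffset + InArgsSizeBytes; // 24
    InArgsSizeBytes += 16;
    assert(OffI32 == 8 && OffVec == 24 && InArgsSizeBytes == 32);
    return 0;
  }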
@@ -570,7 +585,6 @@ void TargetX8632::addProlog(CfgNode *Node) { |
// or B. |
const bool SimpleCoalescing = true; |
size_t InArgsSizeBytes = 0; |
- size_t RetIpSizeBytes = 4; |
size_t PreservedRegsSizeBytes = 0; |
LocalsSizeBytes = 0; |
Context.init(Node); |
@@ -657,6 +671,13 @@ void TargetX8632::addProlog(CfgNode *Node) { |
_mov(ebp, esp); |
} |
+ if (NeedsStackAlignment) { |
+ uint32_t StackSize = applyStackAlignment( |
+ X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes + LocalsSizeBytes); |
+ LocalsSizeBytes = |
+ StackSize - X86_RET_IP_SIZE_BYTES - PreservedRegsSizeBytes; |
+ } |
+ |
// Generate "sub esp, LocalsSizeBytes" |
if (LocalsSizeBytes) |
_sub(getPhysicalRegister(Reg_esp), |
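A worked sketch of the prologue rounding above, using hypothetical sizes (12 bytes of preserved registers, 20 bytes of locals); only X86_RET_IP_SIZE_BYTES = 4 and the 16-byte alignment come from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t RetIp = 4, Preserved = 12, Locals = 20; // hypothetical
    uint32_t StackSize = (RetIp + Preserved + Locals + 15) & ~15u; // 36 -> 48
    uint32_t LocalsSizeBytes = StackSize - RetIp - Preserved;      // 48 - 16 = 32
    // After "sub esp, 32" the frame totals 4 + 12 + 32 = 48 bytes, so esp
    // is 16-byte aligned again at the bottom of the frame.
    assert(StackSize == 48 && LocalsSizeBytes == 32);
    return 0;
  }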
@@ -668,7 +689,7 @@ void TargetX8632::addProlog(CfgNode *Node) { |
// for those that were register-allocated. Args are pushed right to |
// left, so Arg[0] is closest to the stack/frame pointer. |
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); |
- size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes; |
+ size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES; |
if (!IsEbpBasedFrame) |
BasicFrameOffset += LocalsSizeBytes; |
@@ -959,12 +980,43 @@ llvm::SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include, |
void TargetX8632::lowerAlloca(const InstAlloca *Inst) { |
IsEbpBasedFrame = true; |
- // TODO(sehr,stichnot): align allocated memory, keep stack aligned, minimize |
- // the number of adjustments of esp, etc. |
+ // Conservatively require the stack to be aligned. Some stack |
+ // adjustment operations implemented below assume that the stack is |
+ // aligned before the alloca. All the alloca code ensures that the |
+ // stack alignment is preserved after the alloca. The stack alignment |
+ // restriction can be relaxed in some cases. |
+ NeedsStackAlignment = true; |
+ |
+ // TODO(sehr,stichnot): align allocated memory, minimize the number of |
+  // adjustments of esp, etc. |
jvoung (off chromium) 2014/08/12 01:36:23: Is the "align allocated memory" part of the TODO n
wala 2014/08/12 02:56:09: Good point, the part about aligning allocas is don
Variable *esp = getPhysicalRegister(Reg_esp); |
Operand *TotalSize = legalize(Inst->getSizeInBytes()); |
Variable *Dest = Inst->getDest(); |
- _sub(esp, TotalSize); |
+ uint32_t AlignmentParam = Inst->getAlignInBytes(); |
+ |
+ // LLVM enforces power of 2 alignment. |
+ assert((AlignmentParam & (AlignmentParam - 1)) == 0); |
+ assert((X86_STACK_ALIGNMENT_BYTES & (X86_STACK_ALIGNMENT_BYTES - 1)) == 0); |
+ |
+ uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES); |
+ if (Alignment > X86_STACK_ALIGNMENT_BYTES) { |
+ _and(esp, Ctx->getConstantInt(IceType_i32, -Alignment)); |
+ } |
+ if (ConstantInteger *ConstantTotalSize = |
+ llvm::dyn_cast<ConstantInteger>(TotalSize)) { |
+ uint32_t Value = ConstantTotalSize->getValue(); |
+ // Round Value up to the next highest multiple of the alignment. |
+ Value = (Value + Alignment - 1) & -Alignment; |
+ _sub(esp, Ctx->getConstantInt(IceType_i32, Value)); |
+ } else { |
+ // Non-constant sizes need to be adjusted to the next highest |
+ // multiple of the required alignment at runtime. |
+ Variable *T = makeReg(IceType_i32); |
+ _mov(T, TotalSize); |
+ _add(T, Ctx->getConstantInt(IceType_i32, Alignment - 1)); |
+ _and(T, Ctx->getConstantInt(IceType_i32, -Alignment)); |
+ _sub(esp, T); |
+ } |
_mov(Dest, esp); |
} |
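Roughly, the code above would lower a hypothetical alloca of 20 bytes with a requested 32-byte alignment into a sequence along these lines (illustrative only; register choice and legalization details are not taken from the patch):

  and  esp, -32     ; requested alignment exceeds the 16-byte stack alignment
  sub  esp, 32      ; constant size 20 rounded up to a multiple of 32
  mov  <dest>, esp  ; the allocation starts at the new esp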
@@ -1592,51 +1644,78 @@ void TargetX8632::lowerBr(const InstBr *Inst) { |
} |
void TargetX8632::lowerCall(const InstCall *Instr) { |
+ // x86-32 calling convention: |
+ // |
+ // * At the point before the call, the stack must be aligned to 16 |
+ // bytes. |
+ // |
+ // * The first four arguments of vector type, regardless of their |
+ // position relative to the other arguments in the argument list, are |
+ // placed in registers xmm0 - xmm3. |
+ // |
+ // * Other arguments are pushed onto the stack in right-to-left order, |
+ // such that the left-most argument ends up on the top of the stack at |
+ // the lowest memory address. |
+ // |
+ // * Stack arguments of vector type are aligned to start at the next |
+ // highest multiple of 16 bytes. Other stack arguments are aligned to |
+ // 4 bytes. |
+ // |
+ // This intends to match the section "IA-32 Function Calling |
+ // Convention" of the document "OS X ABI Function Call Guide" by |
+ // Apple. |
+ NeedsStackAlignment = true; |
+ |
+ OperandList XmmArgs; |
+ OperandList StackArgs, StackArgLocations; |
+ uint32_t ParameterAreaSizeBytes = 0; |
+ |
// Classify each argument operand according to the location where the |
// argument is passed. |
- OperandList XmmArgs; |
- OperandList StackArgs; |
for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) { |
Operand *Arg = Instr->getArg(i); |
- if (isVectorType(Arg->getType()) && XmmArgs.size() < X86_MAX_XMM_ARGS) { |
+ Type Ty = Arg->getType(); |
+ // The PNaCl ABI requires the width of arguments to be at least 32 bits. |
+ assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_i64 || |
+ Ty == IceType_f64 || isVectorType(Ty)); |
+ if (isVectorType(Ty) && XmmArgs.size() < X86_MAX_XMM_ARGS) { |
XmmArgs.push_back(Arg); |
} else { |
StackArgs.push_back(Arg); |
+ if (isVectorType(Arg->getType())) { |
+ ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes); |
+ } |
+ Variable *esp = Func->getTarget()->getPhysicalRegister(Reg_esp); |
+ Constant *Loc = Ctx->getConstantInt(IceType_i32, ParameterAreaSizeBytes); |
+ StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc)); |
+ ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType()); |
} |
} |
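As a worked example of the classification above (hypothetical signature): for a call with arguments (i32 a, v4f32 b, f64 c, v4i32 d), b and d are the first two vector arguments and are passed in xmm0 and xmm1; a and c fall through to the stack, with a at [esp + 0] and c at [esp + 4], so ParameterAreaSizeBytes reaches 12 and is rounded up to 16 by the alignment step below. A fifth vector argument would overflow to the stack and would first round the running size up to the next multiple of 16.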
- // For stack arguments, generate a sequence of push instructions, |
- // pushing right to left, keeping track of stack offsets in case a |
- // push involves a stack operand and we are using an esp-based frame. |
- uint32_t StackOffset = 0; |
- // TODO: Consolidate the stack adjustment for function calls by |
- // reserving enough space for the arguments only once. |
+ |
+ // Adjust the parameter area so that the stack is aligned. It is |
+ // assumed that the stack is already aligned at the start of the |
+ // calling sequence. |
+ ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes); |
+ |
+ // Subtract the appropriate amount for the argument area. This also |
+ // takes care of setting the stack adjustment during emission. |
// |
// TODO: If for some reason the call instruction gets dead-code |
// eliminated after lowering, we would need to ensure that the |
- // pre-call push instructions and the post-call esp adjustment get |
- // eliminated as well. |
- for (OperandList::reverse_iterator I = StackArgs.rbegin(), |
- E = StackArgs.rend(); I != E; ++I) { |
- Operand *Arg = legalize(*I); |
- if (Arg->getType() == IceType_i64) { |
- _push(hiOperand(Arg)); |
- _push(loOperand(Arg)); |
- } else if (Arg->getType() == IceType_f64 || isVectorType(Arg->getType())) { |
- // If the Arg turns out to be a memory operand, more than one push |
- // instruction is required. This ends up being somewhat clumsy in |
- // the current IR, so we use a workaround. Force the operand into |
- // a (xmm) register, and then push the register. An xmm register |
- // push is actually not possible in x86, but the Push instruction |
- // emitter handles this by decrementing the stack pointer and |
- // directly writing the xmm register value. |
- _push(legalize(Arg, Legal_Reg)); |
- } else { |
- // Otherwise PNaCl requires parameter types to be at least 32-bits. |
- assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32); |
- _push(Arg); |
- } |
- StackOffset += typeWidthInBytesOnStack(Arg->getType()); |
+ // pre-call and the post-call esp adjustment get eliminated as well. |
+ if (ParameterAreaSizeBytes) { |
+ _adjust_stack(ParameterAreaSizeBytes); |
} |
+ |
+ // Copy arguments that are passed on the stack to the appropriate |
+ // stack locations. |
+ for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) { |
+ lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i])); |
+ // TODO: Consider calling postLower() here to reduce the register |
+ // pressure associated with using too many infinite weight |
+ // temporaries when lowering the call sequence in -Om1 mode. |
+ } |
+ |
// Copy arguments to be passed in registers to the appropriate |
// registers. |
// TODO: Investigate the impact of lowering arguments passed in |
@@ -1700,10 +1779,11 @@ void TargetX8632::lowerCall(const InstCall *Instr) { |
if (ReturnRegHi) |
Context.insert(InstFakeDef::create(Func, ReturnRegHi)); |
- // Add the appropriate offset to esp. |
- if (StackOffset) { |
+ // Add the appropriate offset to esp. The call instruction takes care |
+ // of resetting the stack offset during emission. |
+ if (ParameterAreaSizeBytes) { |
Variable *esp = Func->getTarget()->getPhysicalRegister(Reg_esp); |
- _add(esp, Ctx->getConstantInt(IceType_i32, StackOffset)); |
+ _add(esp, Ctx->getConstantInt(IceType_i32, ParameterAreaSizeBytes)); |
} |
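Putting the pieces together for the hypothetical (i32 a, v4f32 b, f64 c, v4i32 d) call sketched earlier, the emitted calling sequence would look roughly like this (illustrative; exact operand forms depend on legalization):

  sub  esp, 16   ; _adjust_stack reserves the whole 16-byte parameter area once
  ; ... a and c stored to [esp + 0] and [esp + 4] via lowerStore ...
  ; ... b and d copied into xmm0 and xmm1 ...
  call f
  add  esp, 16   ; the post-call adjustment above releases the parameter area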
// Insert a register-kill pseudo instruction. |
@@ -2182,9 +2262,9 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) { |
} else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { |
// Use pshufd and movd/movss. |
// |
- // ALIGNHACK: Force vector operands to registers in instructions that |
- // require aligned memory operands until support for stack alignment |
- // is implemented. |
+ // ALIGNHACK: Force vector operands to registers in instructions |
+ // that require aligned memory operands until support for data |
+ // alignment is implemented. |
#define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
Operand *SourceVectRM = |
legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem); |
@@ -2269,8 +2349,8 @@ void TargetX8632::lowerFcmp(const InstFcmp *Inst) { |
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem); |
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem); |
- // ALIGNHACK: Without support for stack alignment, both operands to |
- // cmpps need to be forced into registers. Once support for stack |
+ // ALIGNHACK: Without support for data alignment, both operands to |
+ // cmpps need to be forced into registers. Once support for data |
// alignment is implemented, remove LEGAL_HACK. |
#define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
switch (Condition) { |
@@ -2410,8 +2490,8 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { |
} |
// TODO: ALIGNHACK: Both operands to compare instructions need to be |
- // in registers until stack alignment support is implemented. Once |
- // there is support for stack alignment, LEGAL_HACK can be removed. |
+ // in registers until data alignment support is implemented. Once |
+ // there is support for data alignment, LEGAL_HACK can be removed. |
#define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
Variable *T = makeReg(Ty); |
switch (Condition) { |
@@ -2631,9 +2711,9 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { |
Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]); |
Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]); |
- // ALIGNHACK: Force vector operands to registers in instructions that |
- // require aligned memory operands until support for stack alignment |
- // is implemented. |
+ // ALIGNHACK: Force vector operands to registers in instructions |
+ // that require aligned memory operands until support for data |
+ // alignment is implemented. |
#define ALIGN_HACK(Vect) legalizeToVar((Vect)) |
if (Index == 1) { |
SourceVectRM = ALIGN_HACK(SourceVectRM); |
@@ -2921,7 +3001,8 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { |
} |
case Intrinsics::Memset: { |
// The value operand needs to be extended to a stack slot size |
- // because "push" only works for a specific operand size. |
+ // because the PNaCl ABI requires arguments to be at least 32 bits |
+ // wide. |
Operand *ValOp = Instr->getArg(1); |
assert(ValOp->getType() == IceType_i8); |
Variable *ValExt = Func->makeVariable(stackSlotType(), Context.getNode()); |
@@ -3590,9 +3671,9 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) { |
Variable *T = makeReg(SrcTy); |
Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem); |
Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem); |
- // ALIGNHACK: Until stack alignment support is implemented, vector |
+ // ALIGNHACK: Until data alignment support is implemented, vector |
// instructions need to have vector operands in registers. Once |
- // there is support for stack alignment, LEGAL_HACK can be removed. |
+ // there is support for data alignment, LEGAL_HACK can be removed. |
#define LEGAL_HACK(Vect) legalizeToVar((Vect)) |
if (InstructionSet >= SSE4_1) { |
// TODO(wala): If the condition operand is a constant, use blendps |
@@ -3687,13 +3768,16 @@ void TargetX8632::lowerStore(const InstStore *Inst) { |
Operand *Value = Inst->getData(); |
Operand *Addr = Inst->getAddr(); |
OperandX8632Mem *NewAddr = FormMemoryOperand(Addr, Value->getType()); |
+ Type Ty = NewAddr->getType(); |
- if (NewAddr->getType() == IceType_i64) { |
+ if (Ty == IceType_i64) { |
Value = legalize(Value); |
Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm, true); |
Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm, true); |
_store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr))); |
_store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr))); |
+ } else if (isVectorType(Ty)) { |
+ _storep(legalizeToVar(Value), NewAddr); |
} else { |
Value = legalize(Value, Legal_Reg | Legal_Imm, true); |
_store(Value, NewAddr); |
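For the new vector path above, the value is first forced into an xmm register by legalizeToVar and then stored with _storep; presumably this comes out as an unaligned vector move, roughly (illustrative, register names assumed):

  movups xmm0, <value>             ; vector operand legalized into a register
  movups xmmword ptr [addr], xmm0  ; _storep of the register to memory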
@@ -4036,9 +4120,9 @@ void TargetX8632::postLower() { |
llvm::SmallBitVector AvailableTypedRegisters = |
AvailableRegisters & getRegisterSetForType(Var->getType()); |
if (!AvailableTypedRegisters.any()) { |
- // This is a hack in case we run out of physical registers |
- // due to an excessive number of "push" instructions from |
- // lowering a call. |
+ // This is a hack in case we run out of physical registers due |
+ // to an excessively long code sequence, as might happen when |
+ // lowering arguments in lowerCall(). |
AvailableRegisters = WhiteList; |
AvailableTypedRegisters = |
AvailableRegisters & getRegisterSetForType(Var->getType()); |