src/IceTargetLoweringARM32.cpp - Issue 1467473003: Subzero. ARM32. No more SP frobbing.

Unified Diff: src/IceTargetLoweringARM32.cpp

Issue 1467473003: Subzero. ARM32. No more SP frobbing. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: Fixes the Offsetis typo. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringARM32.cpp

diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp

index 60d3a37feb1cd6d546f547f067cff09b6994cf99..12810f6c88656208ead7115fa0021f4915a5ece5 100644

--- a/src/IceTargetLoweringARM32.cpp

+++ b/src/IceTargetLoweringARM32.cpp

@@ -233,12 +233,62 @@ void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) {

}

} // end of anonymous namespace

+uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) {

+ TargetARM32::CallingConv CC;

+ size_t OutArgsSizeBytes = 0;

+ for (SizeT i = 0, NumArgs = Call->getNumArgs(); i < NumArgs; ++i) {

+ Operand *Arg = legalizeUndef(Call->getArg(i));

+ Type Ty = Arg->getType();

+ if (Ty == IceType_i64) {

+ std::pair<int32_t, int32_t> Regs;

+ if (CC.I64InRegs(&Regs)) {

+ continue;

+ }

+ } else if (isVectorType(Ty) || isFloatingType(Ty)) {

+ int32_t Reg;

+ if (CC.FPInReg(Ty, &Reg)) {

+ continue;

+ }

+ } else {

+ assert(Ty == IceType_i32);

+ int32_t Reg;

+ if (CC.I32InReg(&Reg)) {

+ continue;

+ }

+ OutArgsSizeBytes = applyStackAlignmentTy(OutArgsSizeBytes, Ty);

+ OutArgsSizeBytes += typeWidthInBytesOnStack(Ty);

+ }

+ return applyStackAlignment(OutArgsSizeBytes);

+void TargetARM32::findMaxStackOutArgsSize() {

+ // MinNeededOutArgsBytes should be updated if the Target ever creates an

Jim Stichnoth 2015/11/22 04:09:43 s/an/a/ at least for the way I pronounce "high"

John 2015/11/23 18:54:10 Done.

+ // high-level InstCall that requires more stack bytes.

+ constexpr size_t MinNeededOutArgsBytes = 0;

+ MaxOutArgsSizeBytes = MinNeededOutArgsBytes;

+ for (CfgNode *Node : Func->getNodes()) {

+ Context.init(Node);

+ while (!Context.atEnd()) {

+ PostIncrLoweringContext PostIncrement(Context);

+ Inst *CurInstr = Context.getCur();

+ if (auto *Call = llvm::dyn_cast<InstCall>(CurInstr)) {

+ SizeT OutArgsSizeBytes = getCallStackArgumentsSizeBytes(Call);

+ MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, OutArgsSizeBytes);

+ }

void TargetARM32::translateO2() {

TimerMarker T(TimerStack::TT_O2, Func);

// TODO(stichnot): share passes with X86?

// https://code.google.com/p/nativeclient/issues/detail?id=4094

genTargetHelperCalls();

+ findMaxStackOutArgsSize();

// Do not merge Alloca instructions, and lay out the stack.

static constexpr bool SortAndCombineAllocas = false;

@@ -346,6 +396,7 @@ void TargetARM32::translateOm1() {

// TODO: share passes with X86?

genTargetHelperCalls();

+ findMaxStackOutArgsSize();

// Do not merge Alloca instructions, and lay out the stack.

static constexpr bool SortAndCombineAllocas = false;

@@ -473,8 +524,6 @@ void TargetARM32::emitVariable(const Variable *Var) const {

int32_t BaseRegNum = Var->getBaseRegNum();

if (BaseRegNum == Variable::NoRegister) {

BaseRegNum = getFrameOrStackReg();

- if (!hasFramePointer())

- Offset += getStackAdjustment();

}

const Type VarTy = Var->getType();

Str << "[" << getRegName(BaseRegNum, VarTy);

@@ -670,7 +719,11 @@ void TargetARM32::addProlog(CfgNode *Node) {

// +------------------------+

// | 6. padding |

// +------------------------+

- // | 7. allocas |

+ // | 7. allocas (variable) |

+ // +------------------------+

+ // | 8. padding |

+ // +------------------------+

+ // | 9. out args |

// +------------------------+ <--- StackPointer

// The following variables record the size in bytes of the given areas:

@@ -679,7 +732,9 @@ void TargetARM32::addProlog(CfgNode *Node) {

// * GlobalsSize: area 3

// * GlobalsAndSubsequentPaddingSize: areas 3 - 4

// * LocalsSpillAreaSize: area 5

- // * SpillAreaSizeBytes: areas 2 - 6

+ // * SpillAreaSizeBytes: areas 2 - 6, and 9

+ // * MaxOutArgsSizeBytes: area 9

+ //

// Determine stack frame offsets for each Variable without a register

// assignment. This can be done as one variable per stack slot. Or, do

// coalescing by running the register allocator again with an infinite set of

@@ -785,10 +840,13 @@ void TargetARM32::addProlog(CfgNode *Node) {

uint32_t GlobalsAndSubsequentPaddingSize =

GlobalsSize + LocalsSlotsPaddingBytes;

- // Align SP if necessary.

- if (NeedsStackAlignment) {

+ // Add the out args space to the stack, and align SP if necessary.

+ if (!NeedsStackAlignment) {

+ SpillAreaSizeBytes += MaxOutArgsSizeBytes;

+ } else {

uint32_t StackOffset = PreservedRegsSizeBytes;

uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);

+ StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes);

SpillAreaSizeBytes = StackSize - StackOffset;

}

@@ -802,8 +860,6 @@ void TargetARM32::addProlog(CfgNode *Node) {

}

Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

- resetStackAdjustment();

// Fill in stack offsets for stack args, and copy args into registers for

// those that were register-allocated. Args are pushed right to left, so

// Arg[0] is closest to the stack/frame pointer.

@@ -847,7 +903,8 @@ void TargetARM32::addProlog(CfgNode *Node) {

Str << "Stack layout:\n";

uint32_t SPAdjustmentPaddingSize =

SpillAreaSizeBytes - LocalsSpillAreaSize -

- GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;

+ GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -

+ MaxOutArgsSizeBytes;

Str << " in-args = " << InArgsSizeBytes << " bytes\n"

<< " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"

<< " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"

@@ -860,6 +917,7 @@ void TargetARM32::addProlog(CfgNode *Node) {

Str << "Stack details:\n"

<< " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"

<< " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"

+ << " outgoing args size = " << MaxOutArgsSizeBytes << " bytes\n"

<< " locals spill area alignment = " << LocalsSlotsAlignmentBytes

<< " bytes\n"

<< " is FP based = " << UsesFramePointer << "\n";

@@ -956,10 +1014,7 @@ bool TargetARM32::isLegalMemOffset(Type Ty, int32_t Offset) const {

return OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset);

}

-Variable *TargetARM32::newBaseRegister(int32_t OriginalOffset,

- int32_t StackAdjust,

- Variable *OrigBaseReg) {

- int32_t Offset = OriginalOffset + StackAdjust;

+Variable *TargetARM32::newBaseRegister(int32_t Offset, Variable *OrigBaseReg) {

// Legalize will likely need a movw/movt combination, but if the top bits are

// all 0 from negating the offset and subtracting, we could use that instead.

bool ShouldSub = (-Offset & 0xFFFF0000) == 0;

@@ -976,26 +1031,25 @@ Variable *TargetARM32::newBaseRegister(int32_t OriginalOffset,

}

OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,

- int32_t StackAdjust,

Variable *OrigBaseReg,

Variable **NewBaseReg,

int32_t *NewBaseOffset) {

- if (isLegalMemOffset(Ty, Offset + StackAdjust)) {

+ if (isLegalMemOffset(Ty, Offset)) {

return OperandARM32Mem::create(

- Func, Ty, OrigBaseReg, llvm::cast<ConstantInteger32>(

- Ctx->getConstantInt32(Offset + StackAdjust)),

+ Func, Ty, OrigBaseReg,

+ llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(Offset)),

OperandARM32Mem::Offset);

}

if (*NewBaseReg == nullptr) {

- *NewBaseReg = newBaseRegister(Offset, StackAdjust, OrigBaseReg);

- *NewBaseOffset = Offset + StackAdjust;

+ *NewBaseReg = newBaseRegister(Offset, OrigBaseReg);

+ *NewBaseOffset = Offset;

}

- int32_t OffsetDiff = Offset + StackAdjust - *NewBaseOffset;

+ int32_t OffsetDiff = Offset - *NewBaseOffset;

if (!isLegalMemOffset(Ty, OffsetDiff)) {

- *NewBaseReg = newBaseRegister(Offset, StackAdjust, OrigBaseReg);

- *NewBaseOffset = Offset + StackAdjust;

+ *NewBaseReg = newBaseRegister(Offset, OrigBaseReg);

+ *NewBaseOffset = Offset;

OffsetDiff = 0;

}

@@ -1005,9 +1059,8 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,

OperandARM32Mem::Offset);

}

-void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,

- Variable *OrigBaseReg, Variable **NewBaseReg,

- int32_t *NewBaseOffset) {

+void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,

+ Variable **NewBaseReg, int32_t *NewBaseOffset) {

Variable *Dest = MovInstr->getDest();

assert(Dest != nullptr);

Type DestTy = Dest->getType();

@@ -1027,8 +1080,8 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,

assert(SrcR->hasReg());

const int32_t Offset = Dest->getStackOffset();

// This is a _mov(Mem(), Variable), i.e., a store.

- _str(SrcR, createMemOperand(DestTy, Offset, StackAdjust, OrigBaseReg,

- NewBaseReg, NewBaseOffset),

+ _str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,

+ NewBaseOffset),

MovInstr->getPredicate());

// _str() does not have a Dest, so we add a fake-def(Dest).

Context.insert(InstFakeDef::create(Func, Dest));

@@ -1036,8 +1089,8 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, int32_t StackAdjust,

} else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {

if (!Var->hasReg()) {

const int32_t Offset = Var->getStackOffset();

- _ldr(Dest, createMemOperand(DestTy, Offset, StackAdjust, OrigBaseReg,

- NewBaseReg, NewBaseOffset),

+ _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,

+ NewBaseOffset),

MovInstr->getPredicate());

Legalized = true;

}

@@ -1064,7 +1117,6 @@ void TargetARM32::legalizeStackSlots() {

Func->dump("Before legalizeStackSlots");

assert(hasComputedFrame());

Variable *OrigBaseReg = getPhysicalRegister(getFrameOrStackReg());

- int32_t StackAdjust = 0;

// Do a fairly naive greedy clustering for now. Pick the first stack slot

// that's out of bounds and make a new base reg using the architecture's temp

// register. If that works for the next slot, then great. Otherwise, create a

@@ -1091,23 +1143,8 @@ void TargetARM32::legalizeStackSlots() {

NewBaseOffset = 0;

}

- // The stack adjustment only matters if we are using SP instead of FP.

- if (!hasFramePointer()) {

- if (auto *AdjInst = llvm::dyn_cast<InstARM32AdjustStack>(CurInstr)) {

- StackAdjust += AdjInst->getAmount();

- NewBaseOffset += AdjInst->getAmount();

- continue;

- }

- if (llvm::isa<InstARM32Call>(CurInstr)) {

- NewBaseOffset -= StackAdjust;

- StackAdjust = 0;

- continue;

- }

if (auto *MovInstr = llvm::dyn_cast<InstARM32Mov>(CurInstr)) {

- legalizeMov(MovInstr, StackAdjust, OrigBaseReg, &NewBaseReg,

- &NewBaseOffset);

+ legalizeMov(MovInstr, OrigBaseReg, &NewBaseReg, &NewBaseOffset);

}

@@ -1269,7 +1306,14 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {

alignRegisterPow2(T, Alignment);

_sub(SP, SP, T);

}

- _mov(Dest, SP);

+ Variable *T = SP;

+ if (MaxOutArgsSizeBytes != 0) {

+ T = makeReg(getPointerType());

+ Operand *OutArgsSizeRF = legalize(

+ Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);

+ _add(T, SP, OutArgsSizeRF);

+ }

+ _mov(Dest, T);

}

void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {

@@ -2093,6 +2137,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

}

case InstArithmetic::Sub: {

if (Srcs.hasConstOperand()) {

+ // TODO(jpp): lowering Src0R here is wrong -- Src0R is not guaranteed

+ // to be used.

Variable *Src0R = Srcs.src0R(this);

if (Srcs.immediateIsFlexEncodable()) {

Operand *Src1RF = Srcs.src1RF(this);

@@ -2346,7 +2392,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) {

TargetARM32::CallingConv::ARM32_MAX_FP_REG_UNITS> FPArgs;

// Pair of Arg Operand -> stack offset.

llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;

- int32_t ParameterAreaSizeBytes = 0;

+ size_t ParameterAreaSizeBytes = 0;

// Classify each argument operand according to the location where the

// argument is passed.

@@ -2390,16 +2436,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {

// the stack is already aligned at the start of the calling sequence.

ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);

- // Subtract the appropriate amount for the argument area. This also takes

- // care of setting the stack adjustment during emission.

- //

- // TODO: If for some reason the call instruction gets dead-code eliminated

- // after lowering, we would need to ensure that the pre-call and the

- // post-call esp adjustment get eliminated as well.

- if (ParameterAreaSizeBytes) {

- Operand *SubAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),

- Legal_Reg | Legal_Flex);

- _adjust_stack(ParameterAreaSizeBytes, SubAmount);

+ if (ParameterAreaSizeBytes > MaxOutArgsSizeBytes) {

+ llvm::report_fatal_error("MaxOutArgsSizeBytes is not really a max.");

}

// Copy arguments that are passed on the stack to the appropriate stack

@@ -2492,15 +2530,6 @@ void TargetARM32::lowerCall(const InstCall *Instr) {

if (ReturnRegHi)

Context.insert(InstFakeDef::create(Func, ReturnRegHi));

- // Add the appropriate offset to SP. The call instruction takes care of

- // resetting the stack offset during emission.

- if (ParameterAreaSizeBytes) {

- Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),

- Legal_Reg | Legal_Flex);

- Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);

- _add(SP, SP, AddAmount);

- }

// Insert a register-kill pseudo instruction.

Context.insert(InstFakeKill::create(Func, NewCall));

« src/IceTargetLoweringARM32.h ('K') | « src/IceTargetLoweringARM32.h ('k') | src/IceTargetLoweringMIPS32.h » ('j') | no next file with comments »