src/IceTargetLoweringARM32.cpp - Issue 1465213002: Subzero. ARM32. Combine allocas.

Unified Diff: src/IceTargetLoweringARM32.cpp

Issue 1465213002: Subzero. ARM32. Combine allocas. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: Addresses comments. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringARM32.cpp

diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp

index 12810f6c88656208ead7115fa0021f4915a5ece5..f23609b7837cce25402e4d1382ffa9741273d02c 100644

--- a/src/IceTargetLoweringARM32.cpp

+++ b/src/IceTargetLoweringARM32.cpp

@@ -265,7 +265,7 @@ uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) {

}

void TargetARM32::findMaxStackOutArgsSize() {

- // MinNeededOutArgsBytes should be updated if the Target ever creates an

+ // MinNeededOutArgsBytes should be updated if the Target ever creates a

// high-level InstCall that requires more stack bytes.

constexpr size_t MinNeededOutArgsBytes = 0;

MaxOutArgsSizeBytes = MinNeededOutArgsBytes;

@@ -291,7 +291,7 @@ void TargetARM32::translateO2() {

findMaxStackOutArgsSize();

// Do not merge Alloca instructions, and lay out the stack.

- static constexpr bool SortAndCombineAllocas = false;

+ static constexpr bool SortAndCombineAllocas = true;

Func->processAllocas(SortAndCombineAllocas);

Func->dump("After Alloca processing");

@@ -356,6 +356,7 @@ void TargetARM32::translateO2() {

regAlloc(RAK_Global);

if (Func->hasError())

return;

copyRegAllocFromInfWeightVariable64On32(Func->getVariables());

Func->dump("After linear scan regalloc");

@@ -364,6 +365,8 @@ void TargetARM32::translateO2() {

Func->dump("After advanced Phi lowering");

}

+ ForbidTemporaryWithoutReg _(this);

// Stack frame mapping.

Func->genFrame();

if (Func->hasError())

@@ -399,8 +402,8 @@ void TargetARM32::translateOm1() {

findMaxStackOutArgsSize();

// Do not merge Alloca instructions, and lay out the stack.

- static constexpr bool SortAndCombineAllocas = false;

- Func->processAllocas(SortAndCombineAllocas);

+ static constexpr bool DontSortAndCombineAllocas = false;

+ Func->processAllocas(DontSortAndCombineAllocas);

Func->dump("After Alloca processing");

Func->placePhiLoads();

@@ -424,9 +427,12 @@ void TargetARM32::translateOm1() {

regAlloc(RAK_InfOnly);

if (Func->hasError())

return;

copyRegAllocFromInfWeightVariable64On32(Func->getVariables());

Func->dump("After regalloc of infinite-weight variables");

+ ForbidTemporaryWithoutReg _(this);

Func->genFrame();

if (Func->hasError())

return;

@@ -520,6 +526,7 @@ void TargetARM32::emitVariable(const Variable *Var) const {

llvm::report_fatal_error(

"Infinite-weight Variable has no register assigned");

}

+ assert(!Var->isRematerializable());

int32_t Offset = Var->getStackOffset();

int32_t BaseRegNum = Var->getBaseRegNum();

if (BaseRegNum == Variable::NoRegister) {

@@ -850,6 +857,9 @@ void TargetARM32::addProlog(CfgNode *Node) {

SpillAreaSizeBytes = StackSize - StackOffset;

}

+ // Combine fixed alloca with SpillAreaSize.

+ SpillAreaSizeBytes += FixedAllocaSizeBytes;

// Generate "sub sp, SpillAreaSizeBytes"

if (SpillAreaSizeBytes) {

// Use the scratch register if needed to legalize the immediate.

@@ -857,7 +867,11 @@ void TargetARM32::addProlog(CfgNode *Node) {

Legal_Reg | Legal_Flex, getReservedTmpReg());

Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);

_sub(SP, SP, SubAmount);

+ if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {

+ alignRegisterPow2(SP, FixedAllocaAlignBytes);

+ }

}

Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

// Fill in stack offsets for stack args, and copy args into registers for

@@ -1034,6 +1048,7 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,

Variable *OrigBaseReg,

Variable **NewBaseReg,

int32_t *NewBaseOffset) {

+ assert(!OrigBaseReg->isRematerializable());

if (isLegalMemOffset(Ty, Offset)) {

return OperandARM32Mem::create(

Func, Ty, OrigBaseReg,

@@ -1053,6 +1068,7 @@ OperandARM32Mem *TargetARM32::createMemOperand(Type Ty, int32_t Offset,

OffsetDiff = 0;

}

+ assert(!(*NewBaseReg)->isRematerializable());

return OperandARM32Mem::create(

Func, Ty, *NewBaseReg,

llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetDiff)),

@@ -1076,8 +1092,9 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,

bool Legalized = false;

if (!Dest->hasReg()) {

- auto *const SrcR = llvm::cast<Variable>(Src);

+ auto *SrcR = llvm::cast<Variable>(Src);

assert(SrcR->hasReg());

+ assert(!SrcR->isRematerializable());

const int32_t Offset = Dest->getStackOffset();

// This is a _mov(Mem(), Variable), i.e., a store.

_str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,

@@ -1087,12 +1104,26 @@ void TargetARM32::legalizeMov(InstARM32Mov *MovInstr, Variable *OrigBaseReg,

Context.insert(InstFakeDef::create(Func, Dest));

Legalized = true;

} else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {

- if (!Var->hasReg()) {

- const int32_t Offset = Var->getStackOffset();

- _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,

- NewBaseOffset),

- MovInstr->getPredicate());

+ if (Var->isRematerializable()) {

+ // Rematerialization arithmetic.

+ const int32_t ExtraOffset =

+ (static_cast<SizeT>(Var->getRegNum()) == getFrameReg())

+ ? getFrameFixedAllocaOffset()

+ : 0;

+ const int32_t Offset = Var->getStackOffset() + ExtraOffset;

+ Operand *OffsetRF = legalize(Ctx->getConstantInt32(Offset),

+ Legal_Reg | Legal_Flex, Dest->getRegNum());

+ _add(Dest, Var, OffsetRF);

Legalized = true;

+ } else {

+ if (!Var->hasReg()) {

+ const int32_t Offset = Var->getStackOffset();

+ _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,

+ NewBaseOffset),

+ MovInstr->getPredicate());

+ Legalized = true;

+ }

}

@@ -1163,13 +1194,15 @@ Operand *TargetARM32::loOperand(Operand *Operand) {

// increment) in case of duplication.

assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||

Mem->getAddrMode() == OperandARM32Mem::NegOffset);

+ Variable *BaseR = legalizeToReg(Mem->getBase());

if (Mem->isRegReg()) {

- return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),

- Mem->getIndex(), Mem->getShiftOp(),

- Mem->getShiftAmt(), Mem->getAddrMode());

+ Variable *IndexR = legalizeToReg(Mem->getIndex());

+ return OperandARM32Mem::create(Func, IceType_i32, BaseR, IndexR,

+ Mem->getShiftOp(), Mem->getShiftAmt(),

+ Mem->getAddrMode());

} else {

- return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),

- Mem->getOffset(), Mem->getAddrMode());

+ return OperandARM32Mem::create(Func, IceType_i32, BaseR, Mem->getOffset(),

+ Mem->getAddrMode());

}

llvm_unreachable("Unsupported operand type");

@@ -1201,7 +1234,9 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {

Variable *NewBase = Func->makeVariable(Base->getType());

lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,

Base, Four));

- return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),

+ Variable *BaseR = legalizeToReg(NewBase);

+ Variable *IndexR = legalizeToReg(Mem->getIndex());

+ return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,

Mem->getShiftOp(), Mem->getShiftAmt(),

Mem->getAddrMode());

} else {

@@ -1216,16 +1251,17 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {

// mode into a RegReg addressing mode. Since NaCl sandboxing disallows

// RegReg addressing modes, prefer adding to base and replacing

// instead. Thus we leave the old offset alone.

- Constant *Four = Ctx->getConstantInt32(4);

+ Constant *_4 = Ctx->getConstantInt32(4);

Variable *NewBase = Func->makeVariable(Base->getType());

lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,

- NewBase, Base, Four));

+ NewBase, Base, _4));

Base = NewBase;

} else {

Offset =

llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));

}

- return OperandARM32Mem::create(Func, SplitType, Base, Offset,

+ Variable *BaseR = legalizeToReg(Base);

+ return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,

Mem->getAddrMode());

}

@@ -1264,7 +1300,6 @@ llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,

}

void TargetARM32::lowerAlloca(const InstAlloca *Inst) {

- UsesFramePointer = true;

// Conservatively require the stack to be aligned. Some stack adjustment

// operations implemented below assume that the stack is aligned before the

// alloca. All the alloca code ensures that the stack alignment is preserved

@@ -1272,29 +1307,53 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {

// cases.

NeedsStackAlignment = true;

- // TODO(stichnot): minimize the number of adjustments of SP, etc.

- Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);

- Variable *Dest = Inst->getDest();

- uint32_t AlignmentParam = Inst->getAlignInBytes();

// For default align=0, set it to the real value 1, to avoid any

// bit-manipulation problems below.

- AlignmentParam = std::max(AlignmentParam, 1u);

+ const uint32_t AlignmentParam = std::max(1u, Inst->getAlignInBytes());

// LLVM enforces power of 2 alignment.

assert(llvm::isPowerOf2_32(AlignmentParam));

assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));

- uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);

- if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {

+ const uint32_t Alignment =

+ std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);

+ const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;

+ const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;

+ const bool AllocaWithKnownOffset = Inst->getKnownFrameOffset();

+ const bool UseFramePointer =

+ hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;

+ if (UseFramePointer)

+ setHasFramePointer();

+ Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);

+ if (OverAligned) {

alignRegisterPow2(SP, Alignment);

}

+ Variable *Dest = Inst->getDest();

Operand *TotalSize = Inst->getSizeInBytes();

if (const auto *ConstantTotalSize =

llvm::dyn_cast<ConstantInteger32>(TotalSize)) {

- uint32_t Value = ConstantTotalSize->getValue();

- Value = Utils::applyAlignment(Value, Alignment);

- Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));

- _sub(SP, SP, SubAmount);

+ const uint32_t Value =

+ Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);

+ // Constant size alloca.

+ if (!UseFramePointer) {

+ // If we don't need a Frame Pointer, this alloca has a known offset to the

+ // stack pointer. We don't need to adjust the stack pointer, nor assign any

+ // value to Dest, as Dest is rematerializable.

+ assert(Dest->isRematerializable());

+ FixedAllocaSizeBytes += Value;

+ Context.insert(InstFakeDef::create(Func, Dest));

+ return;

+ }

+ // If a frame pointer is required, then we need to store the alloca'd result

+ // in Dest.

+ Operand *SubAmountRF =

+ legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);

+ _sub(SP, SP, SubAmountRF);

} else {

// Non-constant sizes need to be adjusted to the next highest multiple of

// the required alignment at runtime.

@@ -1306,6 +1365,8 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {

alignRegisterPow2(T, Alignment);

_sub(SP, SP, T);

}

+ // Adds back a few bytes to SP to account for the out args area.

Variable *T = SP;

if (MaxOutArgsSizeBytes != 0) {

T = makeReg(getPointerType());

@@ -1313,6 +1374,7 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {

Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);

_add(T, SP, OutArgsSizeRF);

}

_mov(Dest, T);

}

@@ -1976,6 +2038,12 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,

void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

Variable *Dest = Inst->getDest();

+ if (Dest->isRematerializable()) {

+ Context.insert(InstFakeDef::create(Func, Dest));

+ return;

+ }

if (Dest->getType() == IceType_i1) {

lowerInt1Arithmetic(Inst);

return;

@@ -2139,8 +2207,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

if (Srcs.hasConstOperand()) {

// TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed

// to be used.

- Variable *Src0R = Srcs.src0R(this);

if (Srcs.immediateIsFlexEncodable()) {

+ Variable *Src0R = Srcs.src0R(this);

Operand *Src1RF = Srcs.src1RF(this);

if (Srcs.swappedOperands()) {

_rsb(T, Src0R, Src1RF);

@@ -2151,6 +2219,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

return;

}

if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {

+ Variable *Src0R = Srcs.src0R(this);

Operand *Src1F = Srcs.negatedSrc1F(this);

_add(T, Src0R, Src1F);

_mov(Dest, T);

@@ -2215,6 +2284,12 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

void TargetARM32::lowerAssign(const InstAssign *Inst) {

Variable *Dest = Inst->getDest();

+ if (Dest->isRematerializable()) {

+ Context.insert(InstFakeDef::create(Func, Dest));

+ return;

+ }

Operand *Src0 = Inst->getSrc(0);

assert(Dest->getType() == Src0->getType());

if (Dest->getType() == IceType_i64) {

@@ -4425,13 +4500,17 @@ OperandARM32Mem *TargetARM32::formAddressingMode(Type Ty, Cfg *Func,

assert(OffsetImm < 0 ? (ValidImmMask & -OffsetImm) == -OffsetImm

: (ValidImmMask & OffsetImm) == OffsetImm);

+ Variable *BaseR = makeReg(getPointerType());

+ Context.insert(InstAssign::create(Func, BaseR, BaseVar));

if (OffsetReg != nullptr) {

- return OperandARM32Mem::create(Func, Ty, BaseVar, OffsetReg, ShiftKind,

+ Variable *OffsetR = makeReg(getPointerType());

+ Context.insert(InstAssign::create(Func, OffsetR, OffsetReg));

+ return OperandARM32Mem::create(Func, Ty, BaseR, OffsetR, ShiftKind,

OffsetRegShamt);

}

return OperandARM32Mem::create(

- Func, Ty, BaseVar,

+ Func, Ty, BaseR,

llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetImm)));

}

@@ -4630,7 +4709,8 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,

if (RegNum == Variable::NoRegister) {

if (Variable *Subst = getContext().availabilityGet(From)) {

// At this point we know there is a potential substitution available.

- if (Subst->mustHaveReg() && !Subst->hasReg()) {

+ if (!Subst->isRematerializable() && Subst->mustHaveReg() &&

+ !Subst->hasReg()) {

// At this point we know the substitution will have a register.

if (From->getType() == Subst->getType()) {

// At this point we know the substitution's register is compatible.

@@ -4788,6 +4868,13 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,

}

if (auto *Var = llvm::dyn_cast<Variable>(From)) {

+ if (Var->isRematerializable()) {

+ // TODO(jpp): We don't need to rematerialize Var if legalize() was invoked

+ // for a Variable in a Mem operand.

+ Variable *T = makeReg(Var->getType(), RegNum);

+ _mov(T, Var);

+ return T;

+ }

// Check if the variable is guaranteed a physical register. This can happen

// either when the variable is pre-colored or when it is assigned infinite

// weight.

@@ -4844,9 +4931,9 @@ OperandARM32Mem *TargetARM32::formMemoryOperand(Operand *Operand, Type Ty) {

// If we didn't do address mode optimization, then we only have a

// base/offset to work with. ARM always requires a base register, so

// just use that to hold the operand.

- Variable *Base = legalizeToReg(Operand);

+ Variable *BaseR = legalizeToReg(Operand);

return OperandARM32Mem::create(

- Func, Ty, Base,

+ Func, Ty, BaseR,

llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));

}

@@ -4863,6 +4950,7 @@ Variable64On32 *TargetARM32::makeI64RegPair() {

Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {

// There aren't any 64-bit integer registers for ARM32.

assert(Type != IceType_i64);

+ assert(AllowTemporaryWithNoReg || RegNum != Variable::NoRegister);

Variable *Reg = Func->makeVariable(Type);

if (RegNum == Variable::NoRegister)

Reg->setMustHaveReg();

@@ -4871,7 +4959,8 @@ Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {

return Reg;

}

-void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {

+void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align,

+ int32_t TmpRegNum) {

assert(llvm::isPowerOf2_32(Align));

uint32_t RotateAmt;

uint32_t Immed_8;

@@ -4880,10 +4969,12 @@ void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {

// it fits at all). Assume Align is usually small, in which case BIC works

// better. Thus, this rounds down to the alignment.

if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {

- Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);

+ Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex,

+ TmpRegNum);

_bic(Reg, Reg, Mask);

} else {

- Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);

+ Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex,

+ TmpRegNum);

_and(Reg, Reg, Mask);

}

« no previous file with comments | « src/IceTargetLoweringARM32.h ('k') | tests_lit/assembler/arm32/bic.ll » ('j') | no next file with comments »