| Index: src/IceTargetLoweringARM32.cpp
|
| diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
|
| index 2305a1b0c50107f439f6a7e3e616269bf6d44742..af235881642808b4a6fa7434cbd5fd5149b5de46 100644
|
| --- a/src/IceTargetLoweringARM32.cpp
|
| +++ b/src/IceTargetLoweringARM32.cpp
|
| @@ -126,10 +126,17 @@ const uint32_t ARM32_MAX_GPR_ARG = 4;
|
| // Stack alignment
|
| const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
|
|
|
| +// Value is in bytes. Returns Value rounded up to the next multiple of
|
| +// the stack alignment.
|
| +uint32_t applyStackAlignment(uint32_t Value) {
|
| + return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
|
| +}
|
| +
|
| } // end of anonymous namespace
|
|
|
| TargetARM32::TargetARM32(Cfg *Func)
|
| - : TargetLowering(Func), UsesFramePointer(false) {
|
| + : TargetLowering(Func), UsesFramePointer(false), NeedsStackAlignment(false),
|
| + IsLeafFunction(true), SpillAreaSizeBytes(0) {
|
| // TODO: Don't initialize IntegerRegisters and friends every time.
|
| // Instead, initialize in some sort of static initializer for the
|
| // class.
|
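For reference, Utils::applyAlignment is only called, not defined, in this
patch. A minimal sketch of the power-of-two round-up it is assumed to
perform (not the actual Subzero implementation):

#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align; Align must be a power of
// two for the bitmask trick to be valid.
static uint32_t applyAlignment(uint32_t Value, uint32_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0);
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  // With ARM32_STACK_ALIGNMENT_BYTES == 16:
  assert(applyAlignment(0, 16) == 0);   // already aligned: unchanged
  assert(applyAlignment(1, 16) == 16);  // rounds up
  assert(applyAlignment(20, 16) == 32); // next multiple of 16
  return 0;
}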
| @@ -396,21 +403,21 @@ void TargetARM32::lowerArguments() {
|
| } else if (Ty == IceType_i64) {
|
| if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
|
| continue;
|
| - int32_t RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
|
| - int32_t RegHi = 0;
|
| - ++NumGPRRegsUsed;
|
| + int32_t RegLo;
|
| + int32_t RegHi;
|
| // Always start i64 registers at an even register, so this may end
|
| // up padding away a register.
|
| - if (RegLo % 2 != 0) {
|
| - ++RegLo;
|
| + if (NumGPRRegsUsed % 2 != 0) {
|
| ++NumGPRRegsUsed;
|
| }
|
| - // If this leaves us without room to consume another register,
|
| - // leave any previously speculatively consumed registers as consumed.
|
| - if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
|
| - continue;
|
| + RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
|
| + ++NumGPRRegsUsed;
|
| RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
|
| ++NumGPRRegsUsed;
|
| + // If this bumps us past the last argument register, don't allocate to a
|
| + // register, and leave any speculatively consumed registers as consumed.
|
| + if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
|
| + continue;
|
| Variable *RegisterArg = Func->makeVariable(Ty);
|
| Variable *RegisterLo = Func->makeVariable(IceType_i32);
|
| Variable *RegisterHi = Func->makeVariable(IceType_i32);
|
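The even-register rule is easier to see in isolation. This standalone
sketch reproduces just the counting logic of the rewritten i64 path (the
names and printing are illustrative, not Subzero's API):

#include <cstdio>

enum { MaxGPRArgs = 4 }; // ARM32_MAX_GPR_ARG: r0-r3

// i64 args take an even-aligned register pair; a pair that would extend
// past r3 goes to the stack, and any register skipped for padding stays
// consumed.
static void assignArgs(const bool IsI64[], int N) {
  int Used = 0; // argument registers consumed so far
  for (int i = 0; i < N; ++i) {
    if (!IsI64[i]) {
      if (Used < MaxGPRArgs)
        printf("arg%d: r%d\n", i, Used++);
      else
        printf("arg%d: stack\n", i);
      continue;
    }
    if (Used >= MaxGPRArgs) {
      printf("arg%d: stack\n", i);
      continue;
    }
    if (Used % 2 != 0)
      ++Used; // pad to an even register
    if (Used + 2 > MaxGPRArgs) {
      Used += 2; // leave speculatively consumed registers as consumed
      printf("arg%d: stack\n", i);
    } else {
      printf("arg%d: r%d,r%d\n", i, Used, Used + 1);
      Used += 2;
    }
  }
}

int main() {
  const bool Sig[] = {false, true, false}; // (i32, i64, i32)
  assignArgs(Sig, 3); // arg0: r0; arg1: r2,r3 (r1 padded away); arg2: stack
  return 0;
}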
| @@ -450,16 +457,352 @@ void TargetARM32::lowerArguments() {
|
| }
|
| }
|
|
|
| +// Helper function for addProlog().
|
| +//
|
| +// This assumes Arg is an argument passed on the stack. This sets the
|
| +// frame offset for Arg and updates InArgsSizeBytes according to Arg's
|
| +// width. For an i64 arg that has been split into Lo and Hi components,
|
| +// it calls itself recursively on the components, taking care to handle
|
| +// Lo first because of the little-endian architecture. Lastly, this
|
| +// function generates an instruction to copy Arg into its assigned
|
| +// register if applicable.
|
| +void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
|
| + size_t BasicFrameOffset,
|
| + size_t &InArgsSizeBytes) {
|
| + Variable *Lo = Arg->getLo();
|
| + Variable *Hi = Arg->getHi();
|
| + Type Ty = Arg->getType();
|
| + if (Lo && Hi && Ty == IceType_i64) {
|
| + assert(Lo->getType() != IceType_i64); // don't want infinite recursion
|
| + assert(Hi->getType() != IceType_i64); // don't want infinite recursion
|
| + finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
|
| + finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
|
| + return;
|
| + }
|
| + if (isVectorType(Ty)) {
|
| + InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
|
| + }
|
| + Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
|
| + InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
|
| + // If the argument variable has been assigned a register, we need to load
|
| + // the value from the stack slot.
|
| + if (Arg->hasReg()) {
|
| + assert(Ty != IceType_i64);
|
| + OperandARM32Mem *Mem = OperandARM32Mem::create(
|
| + Func, Ty, FramePtr, llvm::cast<ConstantInteger32>(
|
| + Ctx->getConstantInt32(Arg->getStackOffset())));
|
| + if (isVectorType(Arg->getType())) {
|
| + UnimplementedError(Func->getContext()->getFlags());
|
| + } else {
|
| + _ldr(Arg, Mem);
|
| + }
|
| + // This argument-copying instruction uses an explicit
|
| + // OperandARM32Mem operand instead of a Variable, so its
|
| + // fill-from-stack operation has to be tracked separately for
|
| + // statistics.
|
| + Ctx->statsUpdateFills();
|
| + }
|
| +}
|
| +
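The offset bookkeeping above can be illustrated without Subzero types: a
minimal sketch, assuming i32 slots are 4 bytes on the stack (i.e. that
typeWidthInBytesOnStack(IceType_i32) == 4):

#include <cstddef>
#include <cstdio>

int main() {
  // Each stack arg lives at BasicFrameOffset + InArgsSizeBytes; an i64 is
  // two i32 slots with the Lo half at the lower offset (little-endian).
  const size_t BasicFrameOffset = 8; // e.g. pushed {fp, lr}
  size_t InArgsSizeBytes = 0;

  size_t LoOffset = BasicFrameOffset + InArgsSizeBytes; // i64 Lo half
  InArgsSizeBytes += 4;
  size_t HiOffset = BasicFrameOffset + InArgsSizeBytes; // i64 Hi half
  InArgsSizeBytes += 4;
  size_t I32Offset = BasicFrameOffset + InArgsSizeBytes; // following i32
  InArgsSizeBytes += 4;

  printf("i64: lo at +%zu, hi at +%zu; i32 at +%zu\n", LoOffset, HiOffset,
         I32Offset); // i64: lo at +8, hi at +12; i32 at +16
  return 0;
}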
|
| Type TargetARM32::stackSlotType() { return IceType_i32; }
|
|
|
| void TargetARM32::addProlog(CfgNode *Node) {
|
| - (void)Node;
|
| - UnimplementedError(Func->getContext()->getFlags());
|
| + // Stack frame layout:
|
| + //
|
| + // +------------------------+
|
| + // | 1. preserved registers |
|
| + // +------------------------+
|
| + // | 2. padding |
|
| + // +------------------------+
|
| + // | 3. global spill area |
|
| + // +------------------------+
|
| + // | 4. padding |
|
| + // +------------------------+
|
| + // | 5. local spill area |
|
| + // +------------------------+
|
| + // | 6. padding |
|
| + // +------------------------+
|
| + // | 7. allocas |
|
| + // +------------------------+
|
| + //
|
| + // The following variables record the size in bytes of the given areas:
|
| + // * PreservedRegsSizeBytes: area 1
|
| + // * SpillAreaPaddingBytes: area 2
|
| + // * GlobalsSize: area 3
|
| + // * GlobalsAndSubsequentPaddingSize: areas 3 - 4
|
| + // * LocalsSpillAreaSize: area 5
|
| + // * SpillAreaSizeBytes: areas 2 - 6
|
| + // Determine stack frame offsets for each Variable without a
|
| + // register assignment. This can be done as one variable per stack
|
| + // slot. Or, do coalescing by running the register allocator again
|
| + // with an infinite set of registers (as a side effect, this gives
|
| + // variables a second chance at physical register assignment).
|
| + //
|
| + // A middle ground approach is to leverage sparsity and allocate one
|
| + // block of space on the frame for globals (variables with
|
| + // multi-block lifetime), and one block to share for locals
|
| + // (single-block lifetime).
|
| +
|
| + Context.init(Node);
|
| + Context.setInsertPoint(Context.getCur());
|
| +
|
| + llvm::SmallBitVector CalleeSaves =
|
| + getRegisterSet(RegSet_CalleeSave, RegSet_None);
|
| + RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
|
| + VarList SortedSpilledVariables;
|
| + size_t GlobalsSize = 0;
|
| + // If there is a separate locals area, this represents that area.
|
| + // Otherwise it counts any variable not counted by GlobalsSize.
|
| + SpillAreaSizeBytes = 0;
|
| + // If there is a separate locals area, this specifies the alignment
|
| + // for it.
|
| + uint32_t LocalsSlotsAlignmentBytes = 0;
|
| + // The entire spill locations area gets aligned to the largest natural
|
| + // alignment of the variables that have a spill slot.
|
| + uint32_t SpillAreaAlignmentBytes = 0;
|
| + // For now, we don't have target-specific variables that need special
|
| + // treatment (no stack-slot-linked SpillVariable type).
|
| + std::function<bool(Variable *)> TargetVarHook =
|
| + [](Variable *) { return false; };
|
| +
|
| + // Compute the list of spilled variables and bounds for GlobalsSize, etc.
|
| + getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
|
| + &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
|
| + &LocalsSlotsAlignmentBytes, TargetVarHook);
|
| + uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
|
| + SpillAreaSizeBytes += GlobalsSize;
|
| +
|
| + // Add push instructions for preserved registers.
|
| + // On ARM, "push" can push a whole list of GPRs via a bitmask (0-15).
|
| + // Unlike x86, ARM also has callee-saved float/vector registers.
|
| + // The "vpush" instruction can handle a whole list of float/vector
|
| + // registers, but it only handles contiguous sequences of registers
|
| + // by specifying the start and the length.
|
| + VarList GPRsToPreserve;
|
| + GPRsToPreserve.reserve(CalleeSaves.size());
|
| + uint32_t NumCallee = 0;
|
| + size_t PreservedRegsSizeBytes = 0;
|
| + // Consider FP and LR as callee-save / used as needed.
|
| + if (UsesFramePointer) {
|
| + CalleeSaves[RegARM32::Reg_fp] = true;
|
| + assert(RegsUsed[RegARM32::Reg_fp] == false);
|
| + RegsUsed[RegARM32::Reg_fp] = true;
|
| + }
|
| + if (!IsLeafFunction) {
|
| + CalleeSaves[RegARM32::Reg_lr] = true;
|
| + RegsUsed[RegARM32::Reg_lr] = true;
|
| + }
|
| + for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
|
| + if (CalleeSaves[i] && RegsUsed[i]) {
|
| + // TODO(jvoung): do a separate vpush for each contiguous floating point
|
| + // register segment, adding 4 or 8 bytes per register depending on type.
|
| + ++NumCallee;
|
| + PreservedRegsSizeBytes += 4;
|
| + GPRsToPreserve.push_back(getPhysicalRegister(i));
|
| + }
|
| + }
|
| + Ctx->statsUpdateRegistersSaved(NumCallee);
|
| + if (!GPRsToPreserve.empty())
|
| + _push(GPRsToPreserve);
|
| +
|
| + // Generate "mov FP, SP" if needed.
|
| + if (UsesFramePointer) {
|
| + Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
|
| + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| + _mov(FP, SP);
|
| + // Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode).
|
| + Context.insert(InstFakeUse::create(Func, FP));
|
| + }
|
| +
|
| + // TODO(jvoung): Reuse this code too.
|
| + // Align the variables area. SpillAreaPaddingBytes is the size of
|
| + // the region after the preserved registers and before the spill areas.
|
| + uint32_t SpillAreaPaddingBytes = 0;
|
| + if (SpillAreaAlignmentBytes) {
|
| + assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
|
| + uint32_t PaddingStart = PreservedRegsSizeBytes;
|
| + uint32_t SpillAreaStart =
|
| + Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
|
| + SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
|
| + SpillAreaSizeBytes += SpillAreaPaddingBytes;
|
| + }
|
| +
|
| + // If there are separate globals and locals areas, make sure the
|
| + // locals area is aligned by padding the end of the globals area.
|
| + uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize;
|
| + if (LocalsSlotsAlignmentBytes) {
|
| + assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
|
| + GlobalsAndSubsequentPaddingSize =
|
| + Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
|
| + SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize;
|
| + }
|
| +
|
| + // Align SP if necessary.
|
| + if (NeedsStackAlignment) {
|
| + uint32_t StackOffset = PreservedRegsSizeBytes;
|
| + uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
|
| + SpillAreaSizeBytes = StackSize - StackOffset;
|
| + }
|
| +
|
| + // Generate "sub sp, SpillAreaSizeBytes"
|
| + if (SpillAreaSizeBytes) {
|
| + // Use IP, the intra-procedure-call scratch register, if needed to
|
| + // legalize the immediate.
|
| + Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
|
| + Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
|
| + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| + _sub(SP, SP, SubAmount);
|
| + }
|
| + Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
|
| +
|
| + resetStackAdjustment();
|
| +
|
| + // Fill in stack offsets for stack args, and copy args into registers
|
| + // for those that were register-allocated. Args are pushed right to
|
| + // left, so Arg[0] is closest to the stack/frame pointer.
|
| + Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
|
| + size_t BasicFrameOffset = PreservedRegsSizeBytes;
|
| + if (!UsesFramePointer)
|
| + BasicFrameOffset += SpillAreaSizeBytes;
|
| +
|
| + const VarList &Args = Func->getArgs();
|
| + size_t InArgsSizeBytes = 0;
|
| + unsigned NumGPRArgs = 0;
|
| + for (Variable *Arg : Args) {
|
| + Type Ty = Arg->getType();
|
| + // Skip arguments passed in registers.
|
| + if (isVectorType(Ty)) {
|
| + UnimplementedError(Func->getContext()->getFlags());
|
| + continue;
|
| + } else if (isFloatingType(Ty)) {
|
| + UnimplementedError(Func->getContext()->getFlags());
|
| + continue;
|
| + } else if (Ty == IceType_i64 && NumGPRArgs < ARM32_MAX_GPR_ARG) {
|
| + // Start at an even register.
|
| + if (NumGPRArgs % 2 == 1) {
|
| + ++NumGPRArgs;
|
| + }
|
| + NumGPRArgs += 2;
|
| + if (NumGPRArgs <= ARM32_MAX_GPR_ARG)
|
| + continue;
|
| + } else if (NumGPRArgs < ARM32_MAX_GPR_ARG) {
|
| + ++NumGPRArgs;
|
| + continue;
|
| + }
|
| + finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
|
| + }
|
| +
|
| + // Fill in stack offsets for locals.
|
| + assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
|
| + SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
|
| + UsesFramePointer);
|
| + this->HasComputedFrame = true;
|
| +
|
| + if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
|
| + OstreamLocker L(Func->getContext());
|
| + Ostream &Str = Func->getContext()->getStrDump();
|
| +
|
| + Str << "Stack layout:\n";
|
| + uint32_t SPAdjustmentPaddingSize =
|
| + SpillAreaSizeBytes - LocalsSpillAreaSize -
|
| + GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
|
| + Str << " in-args = " << InArgsSizeBytes << " bytes\n"
|
| + << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
|
| + << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
|
| + << " globals spill area = " << GlobalsSize << " bytes\n"
|
| + << " globals-locals spill areas intermediate padding = "
|
| + << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
|
| + << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
|
| + << " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n";
|
| +
|
| + Str << "Stack details:\n"
|
| + << " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
|
| + << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
|
| + << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
|
| + << " bytes\n"
|
| + << " is FP based = " << UsesFramePointer << "\n";
|
| + }
|
| }
|
|
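The three alignment steps in addProlog() compose like this. A worked
example with assumed sizes (8 bytes of preserved registers from pushing
{fp, lr}, 12 bytes of globals, 20 bytes of locals, 8-byte area alignments;
step 3 only runs when NeedsStackAlignment is set); applyAlignment is the
round-up helper sketched earlier:

#include <cassert>
#include <cstdint>

static uint32_t applyAlignment(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  const uint32_t StackAlign = 16; // ARM32_STACK_ALIGNMENT_BYTES
  uint32_t PreservedRegsSizeBytes = 8;
  uint32_t GlobalsSize = 12;
  uint32_t SpillAreaSizeBytes = 20 + GlobalsSize; // locals + globals = 32
  uint32_t SpillAreaAlignmentBytes = 8;
  uint32_t LocalsSlotsAlignmentBytes = 8;

  // 1. Padding between the preserved registers and the spill areas.
  uint32_t SpillAreaStart =
      applyAlignment(PreservedRegsSizeBytes, SpillAreaAlignmentBytes); // 8
  SpillAreaSizeBytes += SpillAreaStart - PreservedRegsSizeBytes; // +0

  // 2. Pad the end of the globals area so the locals start aligned.
  uint32_t GlobalsAndSubsequentPaddingSize =
      applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes); // 12 -> 16
  SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize; // 36

  // 3. Round the whole frame up to the 16-byte stack alignment.
  uint32_t StackSize = applyAlignment(
      PreservedRegsSizeBytes + SpillAreaSizeBytes, StackAlign); // 44 -> 48
  SpillAreaSizeBytes = StackSize - PreservedRegsSizeBytes;
  assert(SpillAreaSizeBytes == 40); // the "sub sp, #40" amount
  return 0;
}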
|
| void TargetARM32::addEpilog(CfgNode *Node) {
|
| - (void)Node;
|
| - UnimplementedError(Func->getContext()->getFlags());
|
| + InstList &Insts = Node->getInsts();
|
| + InstList::reverse_iterator RI, E;
|
| + for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
|
| + if (llvm::isa<InstARM32Ret>(*RI))
|
| + break;
|
| + }
|
| + if (RI == E)
|
| + return;
|
| +
|
| + // Convert the reverse_iterator position into its corresponding
|
| + // (forward) iterator position.
|
| + InstList::iterator InsertPoint = RI.base();
|
| + --InsertPoint;
|
| + Context.init(Node);
|
| + Context.setInsertPoint(InsertPoint);
|
| +
|
| + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| + if (UsesFramePointer) {
|
| + Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
|
| + // For late-stage liveness analysis (e.g. asm-verbose mode),
|
| + // adding a fake use of SP before the assignment of SP=FP keeps
|
| + // previous SP adjustments from being dead-code eliminated.
|
| + Context.insert(InstFakeUse::create(Func, SP));
|
| + _mov(SP, FP);
|
| + } else {
|
| + // add SP, SpillAreaSizeBytes
|
| + if (SpillAreaSizeBytes) {
|
| + // Use IP, the intra-procedure-call scratch register, if needed to
|
| + // legalize the immediate. It shouldn't be live at this point.
|
| + Operand *AddAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
|
| + Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
|
| + _add(SP, SP, AddAmount);
|
| + }
|
| + }
|
| +
|
| + // Add pop instructions for preserved registers.
|
| + llvm::SmallBitVector CalleeSaves =
|
| + getRegisterSet(RegSet_CalleeSave, RegSet_None);
|
| + VarList GPRsToRestore;
|
| + GPRsToRestore.reserve(CalleeSaves.size());
|
| + // Consider FP and LR as callee-save / used as needed.
|
| + if (UsesFramePointer) {
|
| + CalleeSaves[RegARM32::Reg_fp] = true;
|
| + }
|
| + if (!IsLeafFunction) {
|
| + CalleeSaves[RegARM32::Reg_lr] = true;
|
| + }
|
| + // Pop registers in ascending order just like push
|
| + // (instead of in reverse order).
|
| + for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
|
| + if (CalleeSaves[i] && RegsUsed[i]) {
|
| + GPRsToRestore.push_back(getPhysicalRegister(i));
|
| + }
|
| + }
|
| + if (!GPRsToRestore.empty())
|
| + _pop(GPRsToRestore);
|
| +
|
| + if (!Ctx->getFlags().getUseSandboxing())
|
| + return;
|
| +
|
| + // Change the original ret instruction into a sandboxed return sequence.
|
| + // bundle_lock
|
| + // bic lr, #0xc000000f
|
| + // bx lr
|
| + // bundle_unlock
|
| + // This isn't just aligning to the getBundleAlignLog2Bytes() boundary; it
|
| + // also restricts the return target to the lower 1GB of the address space.
|
| + Operand *RetMask =
|
| + legalize(Ctx->getConstantInt32(0xc000000f), Legal_Reg | Legal_Flex);
|
| + Variable *LR = makeReg(IceType_i32, RegARM32::Reg_lr);
|
| + Variable *RetValue = nullptr;
|
| + if (RI->getSrcSize())
|
| + RetValue = llvm::cast<Variable>(RI->getSrc(0));
|
| + _bundle_lock();
|
| + _bic(LR, LR, RetMask);
|
| + _ret(LR, RetValue);
|
| + _bundle_unlock();
|
| + RI->setDeleted();
|
| }
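One C++ subtlety in the epilogue: a reverse_iterator's base() points one
past the element the reverse_iterator is on, so it must be decremented to
get a forward iterator at the ret instruction itself. A self-contained
illustration with std::list:

#include <cassert>
#include <list>

int main() {
  std::list<int> Insts = {1, 2, 3};
  auto RI = Insts.rbegin();     // on 3
  ++RI;                         // on 2 (scanning backwards, as addEpilog does)
  auto InsertPoint = RI.base(); // forward iterator on 3, one past *RI
  --InsertPoint;                // now on 2, the same element as RI
  assert(*InsertPoint == *RI);
  Insts.insert(InsertPoint, 99); // inserts before 2 -> {1, 99, 2, 3}
  assert((Insts == std::list<int>{1, 99, 2, 3}));
  return 0;
}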
|
|
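As for the mask itself: "bic lr, lr, #0xc000000f" clears the top two bits
(forcing the target below 1GB) and the low four bits (forcing 16-byte
bundle alignment). A quick check of what that computes:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t RetMask = 0xc000000fu;
  uint32_t LR = 0xdeadbeefu;
  uint32_t Target = LR & ~RetMask; // what the bic leaves in lr
  assert(Target == 0x1eadbee0u);
  assert(Target < (1u << 30)); // confined to the lower 1GB
  assert(Target % 16 == 0);    // bundle-aligned
  return 0;
}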
|
| void TargetARM32::split64(Variable *Var) {
|
| @@ -881,6 +1224,8 @@ void TargetARM32::lowerBr(const InstBr *Inst) {
|
| }
|
|
|
| void TargetARM32::lowerCall(const InstCall *Instr) {
|
| + IsLeafFunction = false;
|
| +
|
| // TODO(jvoung): assign arguments to registers and stack. Also reserve stack.
|
| if (Instr->getNumArgs()) {
|
| UnimplementedError(Func->getContext()->getFlags());
|
| @@ -1567,12 +1912,12 @@ Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
|
|
|
| void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
|
| assert(llvm::isPowerOf2_32(Align));
|
| - uint32_t RotateAmt = 0;
|
| + uint32_t RotateAmt;
|
| uint32_t Immed_8;
|
| Operand *Mask;
|
| // Use AND or BIC to mask off the bits, depending on which immediate fits
|
| // (if it fits at all). Assume Align is usually small, in which case BIC
|
| - // works better.
|
| + // works better. Thus, this rounds down to the alignment.
|
| if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
|
| Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
|
| _bic(Reg, Reg, Mask);
|
|
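Whether BIC or AND applies in alignRegisterPow2() depends on which mask
encodes as an ARM flexible immediate: an 8-bit value rotated right by an
even amount. A sketch of that encodability test (the assumed semantics of
OperandARM32FlexImm::canHoldImm, which this patch only calls):

#include <cassert>
#include <cstdint>

// True if V is an 8-bit value rotated right by an even amount, i.e. some
// even left-rotation of V fits in 8 bits.
static bool canHoldImm(uint32_t V) {
  for (uint32_t Rot = 0; Rot < 32; Rot += 2) {
    uint32_t Rotated = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
    if (Rotated <= 0xff)
      return true;
  }
  return false;
}

int main() {
  // Align = 256: the BIC mask Align - 1 = 0xff encodes, while an AND would
  // need ~0xff = 0xffffff00, which does not.
  assert(canHoldImm(0xffu));
  assert(!canHoldImm(0xffffff00u));
  // Align = 0x80000000: the opposite case. BIC's mask 0x7fffffff does not
  // encode, but AND's mask -Align = 0x80000000 does (2 rotated right by 2).
  assert(!canHoldImm(0x7fffffffu));
  assert(canHoldImm(0x80000000u));
  return 0;
}

This is why the lowering tries BIC with (Align - 1) first for small
alignments; per the comment above, it falls back to AND (whose mask would
be ~(Align - 1)) when that immediate fits instead, though the hunk is cut
off before the else branch.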
|