| Index: src/IceTargetLoweringARM32.cpp
|
| diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
|
| index 0634e452d90e6e340a3b4a54b8a745beba732efe..fef145f3b47dec98fea1db86b5da96d9b5ab4de3 100644
|
| --- a/src/IceTargetLoweringARM32.cpp
|
| +++ b/src/IceTargetLoweringARM32.cpp
|
| @@ -47,7 +47,7 @@ namespace {
|
| } while (0)
|
|
|
| // The following table summarizes the logic for lowering the icmp instruction
|
| -// for i32 and narrower types. Each icmp condition has a clear mapping to an
|
| +// for i32 and narrower types. Each icmp condition has a clear mapping to an
|
| // ARM32 conditional move instruction.
|
|
|
| const struct TableIcmp32_ {
|
| @@ -62,8 +62,8 @@ const struct TableIcmp32_ {
|
|
|
| // The following table summarizes the logic for lowering the icmp instruction
|
| // for the i64 type. Two conditional moves are needed for setting to 1 or 0.
|
| -// The operands may need to be swapped, and there is a slight difference
|
| -// for signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc).
|
| +// The operands may need to be swapped, and there is a slight difference for
|
| +// signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc).
|
| const struct TableIcmp64_ {
|
| bool IsSigned;
|
| bool Swapped;
|
| @@ -82,18 +82,16 @@ CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
|
| return TableIcmp32[Index].Mapping;
|
| }
|
|
|
| -// In some cases, there are x-macros tables for both high-level and
|
| -// low-level instructions/operands that use the same enum key value.
|
| -// The tables are kept separate to maintain a proper separation
|
| -// between abstraction layers. There is a risk that the tables could
|
| -// get out of sync if enum values are reordered or if entries are
|
| -// added or deleted. The following dummy namespaces use
|
| +// In some cases, there are x-macros tables for both high-level and low-level
|
| +// instructions/operands that use the same enum key value. The tables are kept
|
| +// separate to maintain a proper separation between abstraction layers. There
|
| +// is a risk that the tables could get out of sync if enum values are reordered
|
| +// or if entries are added or deleted. The following dummy namespaces use
|
| // static_asserts to ensure everything is kept in sync.
|
|
|
| // Validate the enum values in ICMPARM32_TABLE.
|
| namespace dummy1 {
|
| -// Define a temporary set of enum values based on low-level table
|
| -// entries.
|
| +// Define a temporary set of enum values based on low-level table entries.
|
| enum _tmp_enum {
|
| #define X(val, signed, swapped64, C_32, C1_64, C2_64) _tmp_##val,
|
| ICMPARM32_TABLE
|
| @@ -104,8 +102,8 @@ enum _tmp_enum {
|
| #define X(tag, str) static const int _table1_##tag = InstIcmp::tag;
|
| ICEINSTICMP_TABLE
|
| #undef X
|
| -// Define a set of constants based on low-level table entries, and
|
| -// ensure the table entry keys are consistent.
|
| +// Define a set of constants based on low-level table entries, and ensure the
|
| +// table entry keys are consistent.
|
| #define X(val, signed, swapped64, C_32, C1_64, C2_64) \
|
| static const int _table2_##val = _tmp_##val; \
|
| static_assert( \
|
| @@ -113,8 +111,8 @@ ICEINSTICMP_TABLE
|
| "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE");
|
| ICMPARM32_TABLE
|
| #undef X
|
| -// Repeat the static asserts with respect to the high-level table
|
| -// entries in case the high-level table has extra entries.
|
| +// Repeat the static asserts with respect to the high-level table entries in
|
| +// case the high-level table has extra entries.
|
| #define X(tag, str) \
|
| static_assert( \
|
| _table1_##tag == _table2_##tag, \
|
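| The x-macro cross-check described above, reduced to a standalone sketch
| (hypothetical two-entry tables, not the actual Subzero macros):
|
|   // Two x-macro tables that must stay keyed by the same names and values.
|   #define HIGH_TABLE X(Add, 0) X(Sub, 1)
|   #define LOW_TABLE X(Add, 0) X(Sub, 1)
|   namespace dummy_check {
|   // Capture the high-level values as named constants.
|   #define X(tag, val) constexpr int High_##tag = val;
|   HIGH_TABLE
|   #undef X
|   // Re-expand the low-level table and compare entry by entry.
|   #define X(tag, val) static_assert(High_##tag == val, "tables out of sync");
|   LOW_TABLE
|   #undef X
|   } // namespace dummy_check
|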
| @@ -126,17 +124,17 @@ ICEINSTICMP_TABLE
|
| // Stack alignment
|
| const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
|
|
|
| -// Value is in bytes. Return Value adjusted to the next highest multiple
|
| -// of the stack alignment.
|
| +// Value is in bytes. Return Value adjusted to the next highest multiple of the
|
| +// stack alignment.
|
| uint32_t applyStackAlignment(uint32_t Value) {
|
| return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
|
| }
|
|
|
| -// Value is in bytes. Return Value adjusted to the next highest multiple
|
| -// of the stack alignment required for the given type.
|
| +// Value is in bytes. Return Value adjusted to the next highest multiple of the
|
| +// stack alignment required for the given type.
|
| uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) {
|
| - // Use natural alignment, except that normally (non-NaCl) ARM only
|
| - // aligns vectors to 8 bytes.
|
| + // Use natural alignment, except that normally (non-NaCl) ARM only aligns
|
| + // vectors to 8 bytes.
|
| // TODO(jvoung): Check this ...
|
| size_t typeAlignInBytes = typeWidthInBytes(Ty);
|
| if (isVectorType(Ty))
|
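| Both functions above delegate to the same round-up helper; a minimal model
| of it, assuming the alignment is a power of two (a sketch, not the actual
| Utils implementation):
|
|   #include <cstdint>
|   // Round Value up to the next multiple of Alignment (a power of two).
|   uint32_t applyAlignmentSketch(uint32_t Value, uint32_t Alignment) {
|     return (Value + Alignment - 1) & ~(Alignment - 1);
|   }
|   // e.g. applyAlignmentSketch(20, 16) == 32, applyAlignmentSketch(32, 16) == 32.
|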
| @@ -172,9 +170,8 @@ TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
|
|
|
| TargetARM32::TargetARM32(Cfg *Func)
|
| : TargetLowering(Func), CPUFeatures(Func->getContext()->getFlags()) {
|
| - // TODO: Don't initialize IntegerRegisters and friends every time.
|
| - // Instead, initialize in some sort of static initializer for the
|
| - // class.
|
| + // TODO: Don't initialize IntegerRegisters and friends every time. Instead,
|
| + // initialize in some sort of static initializer for the class.
|
| // Limit this size (or do all bitsets need to be the same width)???
|
| llvm::SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
|
| llvm::SmallBitVector Float32Registers(RegARM32::Reg_NUM);
|
| @@ -243,19 +240,18 @@ void TargetARM32::translateO2() {
|
| // Argument lowering
|
| Func->doArgLowering();
|
|
|
| - // Target lowering. This requires liveness analysis for some parts
|
| - // of the lowering decisions, such as compare/branch fusing. If
|
| - // non-lightweight liveness analysis is used, the instructions need
|
| - // to be renumbered first. TODO: This renumbering should only be
|
| - // necessary if we're actually calculating live intervals, which we
|
| - // only do for register allocation.
|
| + // Target lowering. This requires liveness analysis for some parts of the
|
| + // lowering decisions, such as compare/branch fusing. If non-lightweight
|
| + // liveness analysis is used, the instructions need to be renumbered first.
|
| + // TODO: This renumbering should only be necessary if we're actually
|
| + // calculating live intervals, which we only do for register allocation.
|
| Func->renumberInstructions();
|
| if (Func->hasError())
|
| return;
|
|
|
| - // TODO: It should be sufficient to use the fastest liveness
|
| - // calculation, i.e. livenessLightweight(). However, for some
|
| - // reason that slows down the rest of the translation. Investigate.
|
| + // TODO: It should be sufficient to use the fastest liveness calculation,
|
| + // i.e. livenessLightweight(). However, for some reason that slows down the
|
| + // rest of the translation. Investigate.
|
| Func->liveness(Liveness_Basic);
|
| if (Func->hasError())
|
| return;
|
| @@ -266,19 +262,19 @@ void TargetARM32::translateO2() {
|
| return;
|
| Func->dump("After ARM32 codegen");
|
|
|
| - // Register allocation. This requires instruction renumbering and
|
| - // full liveness analysis.
|
| + // Register allocation. This requires instruction renumbering and full
|
| + // liveness analysis.
|
| Func->renumberInstructions();
|
| if (Func->hasError())
|
| return;
|
| Func->liveness(Liveness_Intervals);
|
| if (Func->hasError())
|
| return;
|
| - // Validate the live range computations. The expensive validation
|
| - // call is deliberately only made when assertions are enabled.
|
| + // Validate the live range computations. The expensive validation call is
|
| + // deliberately only made when assertions are enabled.
|
| assert(Func->validateLiveness());
|
| - // The post-codegen dump is done here, after liveness analysis and
|
| - // associated cleanup, to make the dump cleaner and more useful.
|
| + // The post-codegen dump is done here, after liveness analysis and associated
|
| + // cleanup, to make the dump cleaner and more useful.
|
| Func->dump("After initial ARM32 codegen");
|
| Func->getVMetadata()->init(VMK_All);
|
| regAlloc(RAK_Global);
|
| @@ -305,11 +301,10 @@ void TargetARM32::translateO2() {
|
| Func->contractEmptyNodes();
|
| Func->reorderNodes();
|
|
|
| - // Branch optimization. This needs to be done just before code
|
| - // emission. In particular, no transformations that insert or
|
| - // reorder CfgNodes should be done after branch optimization. We go
|
| - // ahead and do it before nop insertion to reduce the amount of work
|
| - // needed for searching for opportunities.
|
| + // Branch optimization. This needs to be done just before code emission. In
|
| + // particular, no transformations that insert or reorder CfgNodes should be
|
| + // done after branch optimization. We go ahead and do it before nop insertion
|
| + // to reduce the amount of work needed for searching for opportunities.
|
| Func->doBranchOpt();
|
| Func->dump("After branch optimization");
|
|
|
| @@ -395,8 +390,8 @@ Variable *TargetARM32::getPhysicalRegister(SizeT RegNum, Type Ty) {
|
| Reg = Func->makeVariable(Ty);
|
| Reg->setRegNum(RegNum);
|
| PhysicalRegisters[Ty][RegNum] = Reg;
|
| - // Specially mark SP and LR as an "argument" so that it is considered
|
| - // live upon function entry.
|
| + // Specially mark SP and LR as an "argument" so that it is considered live
|
| + // upon function entry.
|
| if (RegNum == RegARM32::Reg_sp || RegNum == RegARM32::Reg_lr) {
|
| Func->addImplicitArg(Reg);
|
| Reg->setIgnoreLiveness();
|
| @@ -445,15 +440,15 @@ bool TargetARM32::CallingConv::I64InRegs(std::pair<int32_t, int32_t> *Regs) {
|
| if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
|
| return false;
|
| int32_t RegLo, RegHi;
|
| - // Always start i64 registers at an even register, so this may end
|
| - // up padding away a register.
|
| + // Always start i64 registers at an even register, so this may end up padding
|
| + // away a register.
|
| NumGPRRegsUsed = Utils::applyAlignment(NumGPRRegsUsed, 2);
|
| RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
|
| ++NumGPRRegsUsed;
|
| RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
|
| ++NumGPRRegsUsed;
|
| - // If this bumps us past the boundary, don't allocate to a register
|
| - // and leave any previously speculatively consumed registers as consumed.
|
| + // If this bumps us past the boundary, don't allocate to a register and leave
|
| + // any previously speculatively consumed registers as consumed.
|
| if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
|
| return false;
|
| Regs->first = RegLo;
|
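| The even-register pairing above as standalone code (names and the r0-r3
| constant are assumptions for illustration, mirroring the AAPCS rule that
| an i64 occupies an even/odd GPR pair):
|
|   #include <cstdint>
|   #include <utility>
|   constexpr int32_t MAX_GPR_ARG = 4; // r0-r3
|   // On success, Regs holds the (lo, hi) register indices for the i64.
|   bool i64InRegsSketch(int32_t &NumUsed, std::pair<int32_t, int32_t> *Regs) {
|     if (NumUsed >= MAX_GPR_ARG)
|       return false;
|     NumUsed = (NumUsed + 1) & ~1; // start at an even register, possibly padding
|     int32_t RegLo = NumUsed++;
|     int32_t RegHi = NumUsed++;
|     if (NumUsed > MAX_GPR_ARG) // past r3: regs stay consumed, arg goes on stack
|       return false;
|     *Regs = {RegLo, RegHi};
|     return true;
|   }
|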
| @@ -474,15 +469,15 @@ bool TargetARM32::CallingConv::FPInReg(Type Ty, int32_t *Reg) {
|
| return false;
|
| if (isVectorType(Ty)) {
|
| NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 4);
|
| - // Q registers are declared in reverse order, so
|
| - // RegARM32::Reg_q0 > RegARM32::Reg_q1. Therefore, we need to subtract
|
| - // NumFPRegUnits from Reg_q0. Same thing goes for D registers.
|
| + // Q registers are declared in reverse order, so RegARM32::Reg_q0 >
|
| + // RegARM32::Reg_q1. Therefore, we need to subtract NumFPRegUnits from
|
| + // Reg_q0. Same thing goes for D registers.
|
| static_assert(RegARM32::Reg_q0 > RegARM32::Reg_q1,
|
| "ARM32 Q registers are possibly declared incorrectly.");
|
| *Reg = RegARM32::Reg_q0 - (NumFPRegUnits / 4);
|
| NumFPRegUnits += 4;
|
| - // If this bumps us past the boundary, don't allocate to a register
|
| - // and leave any previously speculatively consumed registers as consumed.
|
| + // If this bumps us past the boundary, don't allocate to a register and
|
| + // leave any previously speculatively consumed registers as consumed.
|
| if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
|
| return false;
|
| } else if (Ty == IceType_f64) {
|
| @@ -491,8 +486,8 @@ bool TargetARM32::CallingConv::FPInReg(Type Ty, int32_t *Reg) {
|
| NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 2);
|
| *Reg = RegARM32::Reg_d0 - (NumFPRegUnits / 2);
|
| NumFPRegUnits += 2;
|
| - // If this bumps us past the boundary, don't allocate to a register
|
| - // and leave any previously speculatively consumed registers as consumed.
|
| + // If this bumps us past the boundary, don't allocate to a register and
|
| + // leave any previously speculatively consumed registers as consumed.
|
| if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
|
| return false;
|
| } else {
|
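| The S-register-unit accounting above, sketched with an assumed pool of 16
| units (s0-s15 / d0-d7 / q0-q3 under the AAPCS VFP convention; not the
| actual Subzero constants):
|
|   #include <cstdint>
|   constexpr uint32_t MAX_FP_REG_UNITS = 16;
|   // Consume UnitsPerArg S-register units (1 for f32, 2 for f64, 4 for a Q
|   // vector), first rounding up to the argument's unit alignment. Returns the
|   // first unit index, or -1 when the argument overflows to the stack; the
|   // units stay consumed either way, as in the code above.
|   int32_t consumeFPUnitsSketch(uint32_t &NumUnits, uint32_t UnitsPerArg) {
|     NumUnits = (NumUnits + UnitsPerArg - 1) & ~(UnitsPerArg - 1);
|     uint32_t First = NumUnits;
|     NumUnits += UnitsPerArg;
|     return NumUnits > MAX_FP_REG_UNITS ? -1 : (int32_t)First;
|   }
|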
| @@ -509,9 +504,9 @@ void TargetARM32::lowerArguments() {
|
| VarList &Args = Func->getArgs();
|
| TargetARM32::CallingConv CC;
|
|
|
| - // For each register argument, replace Arg in the argument list with the
|
| - // home register. Then generate an instruction in the prolog to copy the
|
| - // home register to the assigned location of Arg.
|
| + // For each register argument, replace Arg in the argument list with the home
|
| + // register. Then generate an instruction in the prolog to copy the home
|
| + // register to the assigned location of Arg.
|
| Context.init(Func->getEntryNode());
|
| Context.setInsertPoint(Context.getCur());
|
|
|
| @@ -568,13 +563,12 @@ void TargetARM32::lowerArguments() {
|
|
|
| // Helper function for addProlog().
|
| //
|
| -// This assumes Arg is an argument passed on the stack. This sets the
|
| -// frame offset for Arg and updates InArgsSizeBytes according to Arg's
|
| -// width. For an I64 arg that has been split into Lo and Hi components,
|
| -// it calls itself recursively on the components, taking care to handle
|
| -// Lo first because of the little-endian architecture. Lastly, this
|
| -// function generates an instruction to copy Arg into its assigned
|
| -// register if applicable.
|
| +// This assumes Arg is an argument passed on the stack. This sets the frame
|
| +// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
|
| +// I64 arg that has been split into Lo and Hi components, it calls itself
|
| +// recursively on the components, taking care to handle Lo first because of the
|
| +// little-endian architecture. Lastly, this function generates an instruction
|
| +// to copy Arg into its assigned register if applicable.
|
| void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
|
| size_t BasicFrameOffset,
|
| size_t &InArgsSizeBytes) {
|
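| A toy model of the recursion just described (simplified types; the real
| code operates on Subzero Variables):
|
|   #include <cstddef>
|   struct ArgSlot {
|     ArgSlot *Lo = nullptr, *Hi = nullptr; // non-null only for split i64 args
|     size_t WidthOnStack = 0;              // bytes, already alignment-padded
|     size_t StackOffset = 0;
|   };
|   void placeArg(ArgSlot &A, size_t BasicFrameOffset, size_t &InArgsSizeBytes) {
|     if (A.Lo && A.Hi) {
|       // Little-endian: Lo occupies the lower address, so place it first.
|       placeArg(*A.Lo, BasicFrameOffset, InArgsSizeBytes);
|       placeArg(*A.Hi, BasicFrameOffset, InArgsSizeBytes);
|       return;
|     }
|     A.StackOffset = BasicFrameOffset + InArgsSizeBytes;
|     InArgsSizeBytes += A.WidthOnStack;
|   }
|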
| @@ -591,8 +585,8 @@ void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
|
| InArgsSizeBytes = applyStackAlignmentTy(InArgsSizeBytes, Ty);
|
| Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
|
| InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
|
| - // If the argument variable has been assigned a register, we need to load
|
| - // the value from the stack slot.
|
| + // If the argument variable has been assigned a register, we need to load the
|
| + // value from the stack slot.
|
| if (Arg->hasReg()) {
|
| assert(Ty != IceType_i64);
|
| OperandARM32Mem *Mem = OperandARM32Mem::create(
|
| @@ -606,10 +600,9 @@ void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
|
| } else {
|
| _ldr(Arg, Mem);
|
| }
|
| - // This argument-copying instruction uses an explicit
|
| - // OperandARM32Mem operand instead of a Variable, so its
|
| - // fill-from-stack operation has to be tracked separately for
|
| - // statistics.
|
| + // This argument-copying instruction uses an explicit OperandARM32Mem
|
| + // operand instead of a Variable, so its fill-from-stack operation has to
|
| + // be tracked separately for statistics.
|
| Ctx->statsUpdateFills();
|
| }
|
| }
|
| @@ -642,16 +635,15 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
| // * GlobalsAndSubsequentPaddingSize: areas 3 - 4
|
| // * LocalsSpillAreaSize: area 5
|
| // * SpillAreaSizeBytes: areas 2 - 6
|
| - // Determine stack frame offsets for each Variable without a
|
| - // register assignment. This can be done as one variable per stack
|
| - // slot. Or, do coalescing by running the register allocator again
|
| - // with an infinite set of registers (as a side effect, this gives
|
| - // variables a second chance at physical register assignment).
|
| + // Determine stack frame offsets for each Variable without a register
|
| + // assignment. This can be done as one variable per stack slot. Or, do
|
| + // coalescing by running the register allocator again with an infinite set of
|
| + // registers (as a side effect, this gives variables a second chance at
|
| + // physical register assignment).
|
| //
|
| - // A middle ground approach is to leverage sparsity and allocate one
|
| - // block of space on the frame for globals (variables with
|
| - // multi-block lifetime), and one block to share for locals
|
| - // (single-block lifetime).
|
| + // A middle ground approach is to leverage sparsity and allocate one block of
|
| + // space on the frame for globals (variables with multi-block lifetime), and
|
| + // one block to share for locals (single-block lifetime).
|
|
|
| Context.init(Node);
|
| Context.setInsertPoint(Context.getCur());
|
| @@ -661,14 +653,13 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
| RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
|
| VarList SortedSpilledVariables;
|
| size_t GlobalsSize = 0;
|
| - // If there is a separate locals area, this represents that area.
|
| - // Otherwise it counts any variable not counted by GlobalsSize.
|
| + // If there is a separate locals area, this represents that area. Otherwise
|
| + // it counts any variable not counted by GlobalsSize.
|
| SpillAreaSizeBytes = 0;
|
| - // If there is a separate locals area, this specifies the alignment
|
| - // for it.
|
| + // If there is a separate locals area, this specifies the alignment for it.
|
| uint32_t LocalsSlotsAlignmentBytes = 0;
|
| - // The entire spill locations area gets aligned to largest natural
|
| - // alignment of the variables that have a spill slot.
|
| + // The entire spill locations area gets aligned to largest natural alignment
|
| + // of the variables that have a spill slot.
|
| uint32_t SpillAreaAlignmentBytes = 0;
|
| // For now, we don't have target-specific variables that need special
|
| // treatment (no stack-slot-linked SpillVariable type).
|
| @@ -682,12 +673,11 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
| uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
|
| SpillAreaSizeBytes += GlobalsSize;
|
|
|
| - // Add push instructions for preserved registers.
|
| - // On ARM, "push" can push a whole list of GPRs via a bitmask (0-15).
|
| - // Unlike x86, ARM also has callee-saved float/vector registers.
|
| - // The "vpush" instruction can handle a whole list of float/vector
|
| - // registers, but it only handles contiguous sequences of registers
|
| - // by specifying the start and the length.
|
| + // Add push instructions for preserved registers. On ARM, "push" can push a
|
| + // whole list of GPRs via a bitmask (0-15). Unlike x86, ARM also has
|
| + // callee-saved float/vector registers. The "vpush" instruction can handle a
|
| + // whole list of float/vector registers, but it only handles contiguous
|
| + // sequences of registers by specifying the start and the length.
|
| VarList GPRsToPreserve;
|
| GPRsToPreserve.reserve(CalleeSaves.size());
|
| uint32_t NumCallee = 0;
|
| @@ -704,8 +694,8 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
| }
|
| for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
|
| if (CalleeSaves[i] && RegsUsed[i]) {
|
| - // TODO(jvoung): do separate vpush for each floating point
|
| - // register segment and += 4, or 8 depending on type.
|
| + // TODO(jvoung): do separate vpush for each floating point register
|
| + // segment and += 4, or 8 depending on type.
|
| ++NumCallee;
|
| PreservedRegsSizeBytes += 4;
|
| GPRsToPreserve.push_back(getPhysicalRegister(i));
|
| @@ -724,10 +714,10 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
| Context.insert(InstFakeUse::create(Func, FP));
|
| }
|
|
|
| - // Align the variables area. SpillAreaPaddingBytes is the size of
|
| - // the region after the preserved registers and before the spill areas.
|
| - // LocalsSlotsPaddingBytes is the amount of padding between the globals
|
| - // and locals area if they are separate.
|
| + // Align the variables area. SpillAreaPaddingBytes is the size of the region
|
| + // after the preserved registers and before the spill areas.
|
| + // LocalsSlotsPaddingBytes is the amount of padding between the globals and
|
| + // locals area if they are separate.
|
| assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
|
| assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
|
| uint32_t SpillAreaPaddingBytes = 0;
|
| @@ -758,9 +748,9 @@ void TargetARM32::addProlog(CfgNode *Node) {
|
|
|
| resetStackAdjustment();
|
|
|
| - // Fill in stack offsets for stack args, and copy args into registers
|
| - // for those that were register-allocated. Args are pushed right to
|
| - // left, so Arg[0] is closest to the stack/frame pointer.
|
| + // Fill in stack offsets for stack args, and copy args into registers for
|
| + // those that were register-allocated. Args are pushed right to left, so
|
| + // Arg[0] is closest to the stack/frame pointer.
|
| Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
|
| size_t BasicFrameOffset = PreservedRegsSizeBytes;
|
| if (!UsesFramePointer)
|
| @@ -830,8 +820,8 @@ void TargetARM32::addEpilog(CfgNode *Node) {
|
| if (RI == E)
|
| return;
|
|
|
| - // Convert the reverse_iterator position into its corresponding
|
| - // (forward) iterator position.
|
| + // Convert the reverse_iterator position into its corresponding (forward)
|
| + // iterator position.
|
| InstList::iterator InsertPoint = RI.base();
|
| --InsertPoint;
|
| Context.init(Node);
|
| @@ -840,9 +830,9 @@ void TargetARM32::addEpilog(CfgNode *Node) {
|
| Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| if (UsesFramePointer) {
|
| Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
|
| - // For late-stage liveness analysis (e.g. asm-verbose mode),
|
| - // adding a fake use of SP before the assignment of SP=FP keeps
|
| - // previous SP adjustments from being dead-code eliminated.
|
| + // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
|
| + // use of SP before the assignment of SP=FP keeps previous SP adjustments
|
| + // from being dead-code eliminated.
|
| Context.insert(InstFakeUse::create(Func, SP));
|
| _mov(SP, FP);
|
| } else {
|
| @@ -868,8 +858,8 @@ void TargetARM32::addEpilog(CfgNode *Node) {
|
| if (!MaybeLeafFunc) {
|
| CalleeSaves[RegARM32::Reg_lr] = true;
|
| }
|
| - // Pop registers in ascending order just like push
|
| - // (instead of in reverse order).
|
| + // Pop registers in ascending order just like push (instead of in reverse
|
| + // order).
|
| for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
|
| if (CalleeSaves[i] && RegsUsed[i]) {
|
| GPRsToRestore.push_back(getPhysicalRegister(i));
|
| @@ -903,17 +893,16 @@ void TargetARM32::addEpilog(CfgNode *Node) {
|
|
|
| bool TargetARM32::isLegalVariableStackOffset(int32_t Offset) const {
|
| constexpr bool SignExt = false;
|
| - // TODO(jvoung): vldr of FP stack slots has a different limit from the
|
| - // plain stackSlotType().
|
| + // TODO(jvoung): vldr of FP stack slots has a different limit from the plain
|
| + // stackSlotType().
|
| return OperandARM32Mem::canHoldOffset(stackSlotType(), SignExt, Offset);
|
| }
|
|
|
| StackVariable *TargetARM32::legalizeVariableSlot(Variable *Var,
|
| Variable *OrigBaseReg) {
|
| int32_t Offset = Var->getStackOffset();
|
| - // Legalize will likely need a movw/movt combination, but if the top
|
| - // bits are all 0 from negating the offset and subtracting, we could
|
| - // use that instead.
|
| + // Legalize will likely need a movw/movt combination, but if the top bits are
|
| + // all 0 from negating the offset and subtracting, we could use that instead.
|
| bool ShouldSub = (-Offset & 0xFFFF0000) == 0;
|
| if (ShouldSub)
|
| Offset = -Offset;
|
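| The ShouldSub test above in isolation: if the negated offset has no bits
| in the top half, a 16-bit movw plus a sub suffices; otherwise legalization
| needs the movw/movt pair (a sketch of the same expression):
|
|   #include <cstdint>
|   bool shouldSubSketch(int32_t Offset) {
|     return ((-Offset) & 0xFFFF0000) == 0; // top 16 bits clear after negation
|   }
|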
| @@ -949,15 +938,15 @@ void TargetARM32::legalizeStackSlots() {
|
| return;
|
| Variable *OrigBaseReg = getPhysicalRegister(getFrameOrStackReg());
|
| int32_t StackAdjust = 0;
|
| - // Do a fairly naive greedy clustering for now. Pick the first stack slot
|
| + // Do a fairly naive greedy clustering for now. Pick the first stack slot
|
| // that's out of bounds and make a new base reg using the architecture's temp
|
| - // register. If that works for the next slot, then great. Otherwise, create
|
| - // a new base register, clobbering the previous base register. Never share a
|
| - // base reg across different basic blocks. This isn't ideal if local and
|
| + // register. If that works for the next slot, then great. Otherwise, create a
|
| + // new base register, clobbering the previous base register. Never share a
|
| + // base reg across different basic blocks. This isn't ideal if local and
|
| // multi-block variables are far apart and their references are interspersed.
|
| - // It may help to be more coordinated about assign stack slot numbers
|
| - // and may help to assign smaller offsets to higher-weight variables
|
| - // so that they don't depend on this legalization.
|
| + // It may help to be more coordinated about assigning stack slot numbers
|
| + // and may help to assign smaller offsets to higher-weight variables so
|
| + // that they don't depend on this legalization.
|
| for (CfgNode *Node : Func->getNodes()) {
|
| Context.init(Node);
|
| StackVariable *NewBaseReg = nullptr;
|
| @@ -986,7 +975,7 @@ void TargetARM32::legalizeStackSlots() {
|
| continue;
|
| }
|
| }
|
| - // For now, only Mov instructions can have stack variables. We need to
|
| + // For now, only Mov instructions can have stack variables. We need to
|
| // know the type of instruction because we currently create a fresh one
|
| // to replace Dest/Source, rather than mutate in place.
|
| auto *MovInst = llvm::dyn_cast<InstARM32Mov>(CurInstr);
|
| @@ -1117,15 +1106,15 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {
|
| static_cast<uint32_t>(Const->getValue() >> 32));
|
| }
|
| if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
|
| - // Conservatively disallow memory operands with side-effects
|
| - // in case of duplication.
|
| + // Conservatively disallow memory operands with side-effects in case of
|
| + // duplication.
|
| assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
|
| Mem->getAddrMode() == OperandARM32Mem::NegOffset);
|
| const Type SplitType = IceType_i32;
|
| if (Mem->isRegReg()) {
|
| // We have to make a temp variable T, and add 4 to either Base or Index.
|
| - // The Index may be shifted, so adding 4 can mean something else.
|
| - // Thus, prefer T := Base + 4, and use T as the new Base.
|
| + // The Index may be shifted, so adding 4 can mean something else. Thus,
|
| + // prefer T := Base + 4, and use T as the new Base.
|
| Variable *Base = Mem->getBase();
|
| Constant *Four = Ctx->getConstantInt32(4);
|
| Variable *NewBase = Func->makeVariable(Base->getType());
|
| @@ -1144,8 +1133,8 @@ Operand *TargetARM32::hiOperand(Operand *Operand) {
|
| // We have to make a temp variable and add 4 to either Base or Offset.
|
| // If we add 4 to Offset, this will convert a non-RegReg addressing
|
| // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
|
| - // RegReg addressing modes, prefer adding to base and replacing instead.
|
| - // Thus we leave the old offset alone.
|
| + // RegReg addressing modes, prefer adding to base and replacing
|
| + // instead. Thus we leave the old offset alone.
|
| Constant *Four = Ctx->getConstantInt32(4);
|
| Variable *NewBase = Func->makeVariable(Base->getType());
|
| lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
|
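| The layout assumption behind loOperand/hiOperand: on little-endian ARM the
| lo word of an i64 at address A sits at A and the hi word at A + 4, and for
| constants the split is a plain 32-bit shift (sketch):
|
|   #include <cstdint>
|   inline uint32_t lo32(uint64_t V) { return (uint32_t)V; }
|   inline uint32_t hi32(uint64_t V) { return (uint32_t)(V >> 32); }
|   // e.g. hi32(0x0000000100000002) == 1 and lo32(0x0000000100000002) == 2.
|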
| @@ -1195,11 +1184,11 @@ llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
|
|
|
| void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
|
| UsesFramePointer = true;
|
| - // Conservatively require the stack to be aligned. Some stack
|
| - // adjustment operations implemented below assume that the stack is
|
| - // aligned before the alloca. All the alloca code ensures that the
|
| - // stack alignment is preserved after the alloca. The stack alignment
|
| - // restriction can be relaxed in some cases.
|
| + // Conservatively require the stack to be aligned. Some stack adjustment
|
| + // operations implemented below assume that the stack is aligned before the
|
| + // alloca. All the alloca code ensures that the stack alignment is preserved
|
| + // after the alloca. The stack alignment restriction can be relaxed in some
|
| + // cases.
|
| NeedsStackAlignment = true;
|
|
|
| // TODO(stichnot): minimize the number of adjustments of SP, etc.
|
| @@ -1226,8 +1215,8 @@ void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
|
| Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
|
| _sub(SP, SP, SubAmount);
|
| } else {
|
| - // Non-constant sizes need to be adjusted to the next highest
|
| - // multiple of the required alignment at runtime.
|
| + // Non-constant sizes need to be adjusted to the next highest multiple of
|
| + // the required alignment at runtime.
|
| TotalSize = legalize(TotalSize, Legal_Reg | Legal_Flex);
|
| Variable *T = makeReg(IceType_i32);
|
| _mov(T, TotalSize);
|
| @@ -1265,8 +1254,8 @@ void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
|
| case IceType_i64: {
|
| Variable *ScratchReg = makeReg(IceType_i32);
|
| _orrs(ScratchReg, SrcLoReg, SrcHi);
|
| - // ScratchReg isn't going to be used, but we need the
|
| - // side-effect of setting flags from this operation.
|
| + // ScratchReg isn't going to be used, but we need the side-effect of
|
| + // setting flags from this operation.
|
| Context.insert(InstFakeUse::create(Func, ScratchReg));
|
| }
|
| }
|
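| The orrs trick above in scalar form: a 64-bit value is zero exactly when
| the OR of its two halves is zero, so one flag-setting ORR replaces a pair
| of compares (sketch):
|
|   #include <cstdint>
|   bool isZero64(uint32_t Lo, uint32_t Hi) { return (Lo | Hi) == 0; }
|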
| @@ -1310,21 +1299,21 @@ void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
|
|
|
| void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| Variable *Dest = Inst->getDest();
|
| - // TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier
|
| - // to legalize Src0 to flex or Src1 to flex and there is a reversible
|
| - // instruction. E.g., reverse subtract with immediate, register vs
|
| - // register, immediate.
|
| - // Or it may be the case that the operands aren't swapped, but the
|
| - // bits can be flipped and a different operation applied.
|
| - // E.g., use BIC (bit clear) instead of AND for some masks.
|
| + // TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier to
|
| + // legalize Src0 to flex or Src1 to flex and there is a reversible
|
| + // instruction. E.g., reverse subtract with immediate, register vs register,
|
| + // immediate.
|
| + // Or it may be the case that the operands aren't swapped, but the bits can
|
| + // be flipped and a different operation applied. E.g., use BIC (bit clear)
|
| + // instead of AND for some masks.
|
| Operand *Src0 = legalizeUndef(Inst->getSrc(0));
|
| Operand *Src1 = legalizeUndef(Inst->getSrc(1));
|
| if (Dest->getType() == IceType_i64) {
|
| - // These helper-call-involved instructions are lowered in this
|
| - // separate switch. This is because we would otherwise assume that
|
| - // we need to legalize Src0 to Src0RLo and Src0Hi. However, those go unused
|
| - // with helper calls, and such unused/redundant instructions will fail
|
| - // liveness analysis under -Om1 setting.
|
| + // These helper-call-involved instructions are lowered in this separate
|
| + // switch. This is because we would otherwise assume that we need to
|
| + // legalize Src0 to Src0RLo and Src0Hi. However, those go unused with
|
| + // helper calls, and such unused/redundant instructions will fail liveness
|
| + // analysis under -Om1 setting.
|
| switch (Inst->getOp()) {
|
| default:
|
| break;
|
| @@ -1332,11 +1321,10 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| case InstArithmetic::Sdiv:
|
| case InstArithmetic::Urem:
|
| case InstArithmetic::Srem: {
|
| - // Check for divide by 0 (ARM normally doesn't trap, but we want it
|
| - // to trap for NaCl). Src1Lo and Src1Hi may have already been legalized
|
| - // to a register, which will hide a constant source operand.
|
| - // Instead, check the not-yet-legalized Src1 to optimize-out a divide
|
| - // by 0 check.
|
| + // Check for divide by 0 (ARM normally doesn't trap, but we want it to
|
| + // trap for NaCl). Src1Lo and Src1Hi may have already been legalized to a
|
| + // register, which will hide a constant source operand. Instead, check
|
| + // the not-yet-legalized Src1 to optimize-out a divide by 0 check.
|
| if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src1)) {
|
| if (C64->getValue() == 0) {
|
| _trap();
|
| @@ -1348,8 +1336,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| div0Check(IceType_i64, Src1Lo, Src1Hi);
|
| }
|
| // Technically, ARM has their own aeabi routines, but we can use the
|
| - // non-aeabi routine as well. LLVM uses __aeabi_ldivmod for div,
|
| - // but uses the more standard __moddi3 for rem.
|
| + // non-aeabi routine as well. LLVM uses __aeabi_ldivmod for div, but uses
|
| + // the more standard __moddi3 for rem.
|
| const char *HelperName = "";
|
| switch (Inst->getOp()) {
|
| default:
|
| @@ -1472,12 +1460,11 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| // lsl t_lo, b.lo, c.lo
|
| // a.lo = t_lo
|
| // a.hi = t_hi
|
| - // Can be strength-reduced for constant-shifts, but we don't do
|
| - // that for now.
|
| - // Given the sub/rsb T_C, C.lo, #32, one of the T_C will be negative.
|
| - // On ARM, shifts only take the lower 8 bits of the shift register,
|
| - // and saturate to the range 0-32, so the negative value will
|
| - // saturate to 32.
|
| + // Can be strength-reduced for constant-shifts, but we don't do that for
|
| + // now.
|
| + // Given the sub/rsb T_C, C.lo, #32, one of the T_C will be negative. On
|
| + // ARM, shifts only take the lower 8 bits of the shift register, and
|
| + // saturate to the range 0-32, so the negative value will saturate to 32.
|
| Variable *T_Hi = makeReg(IceType_i32);
|
| Variable *Src1RLo = legalizeToReg(Src1Lo);
|
| Constant *ThirtyTwo = Ctx->getConstantInt32(32);
|
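| The composed 64-bit shift described in the comments above, modeled in
| scalar C++ (branches stand in for ARM's behavior of producing 0 for
| register shift amounts of 32 or more; assumes Shamt < 64):
|
|   #include <cstdint>
|   uint64_t shl64Sketch(uint64_t V, uint32_t Shamt) {
|     uint32_t Lo = (uint32_t)V, Hi = (uint32_t)(V >> 32);
|     // hi = (Hi << Shamt) | (Lo >> (32 - Shamt)) | (Lo << (Shamt - 32));
|     // exactly one of the last two terms survives for any given Shamt.
|     uint32_t NewHi = Shamt < 32
|                          ? (Hi << Shamt) | (Shamt ? Lo >> (32 - Shamt) : 0)
|                          : Lo << (Shamt - 32);
|     uint32_t NewLo = Shamt < 32 ? Lo << Shamt : 0;
|     return ((uint64_t)NewHi << 32) | NewLo;
|   }
|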
| @@ -1493,8 +1480,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| _mov(DestHi, T_Hi);
|
| Variable *T_Lo = makeReg(IceType_i32);
|
| // _mov seems to sometimes have better register preferencing than lsl.
|
| - // Otherwise mov w/ lsl shifted register is a pseudo-instruction
|
| - // that maps to lsl.
|
| + // Otherwise mov w/ lsl shifted register is a pseudo-instruction that
|
| + // maps to lsl.
|
| _mov(T_Lo, OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo,
|
| OperandARM32::LSL, Src1RLo));
|
| _mov(DestLo, T_Lo);
|
| @@ -1513,9 +1500,9 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
|
| // a.hi = t_hi
|
| case InstArithmetic::Ashr: {
|
| // a=b>>c (signed) ==> ...
|
| - // Ashr is similar, but the sub t_c2, c.lo, #32 should set flags,
|
| - // and the next orr should be conditioned on PLUS. The last two
|
| - // right shifts should also be arithmetic.
|
| + // Ashr is similar, but the sub t_c2, c.lo, #32 should set flags, and the
|
| + // next orr should be conditioned on PLUS. The last two right shifts
|
| + // should also be arithmetic.
|
| bool IsAshr = Inst->getOp() == InstArithmetic::Ashr;
|
| Variable *T_Lo = makeReg(IceType_i32);
|
| Variable *Src1RLo = legalizeToReg(Src1Lo);
|
| @@ -1723,13 +1710,13 @@ void TargetARM32::lowerAssign(const InstAssign *Inst) {
|
| Operand *NewSrc;
|
| if (Dest->hasReg()) {
|
| // If Dest already has a physical register, then legalize the Src operand
|
| - // into a Variable with the same register assignment. This especially
|
| + // into a Variable with the same register assignment. This especially
|
| // helps allow the use of Flex operands.
|
| NewSrc = legalize(Src0, Legal_Reg | Legal_Flex, Dest->getRegNum());
|
| } else {
|
| - // Dest could be a stack operand. Since we could potentially need
|
| - // to do a Store (and store can only have Register operands),
|
| - // legalize this to a register.
|
| + // Dest could be a stack operand. Since we could potentially need to do a
|
| + // Store (and store can only have Register operands), legalize this to a
|
| + // register.
|
| NewSrc = legalize(Src0, Legal_Reg);
|
| }
|
| if (isVectorType(Dest->getType())) {
|
| @@ -1810,25 +1797,24 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
|
| }
|
| }
|
|
|
| - // Adjust the parameter area so that the stack is aligned. It is
|
| - // assumed that the stack is already aligned at the start of the
|
| - // calling sequence.
|
| + // Adjust the parameter area so that the stack is aligned. It is assumed that
|
| + // the stack is already aligned at the start of the calling sequence.
|
| ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
|
|
|
| - // Subtract the appropriate amount for the argument area. This also
|
| - // takes care of setting the stack adjustment during emission.
|
| + // Subtract the appropriate amount for the argument area. This also takes
|
| + // care of setting the stack adjustment during emission.
|
| //
|
| - // TODO: If for some reason the call instruction gets dead-code
|
| - // eliminated after lowering, we would need to ensure that the
|
| - // pre-call and the post-call esp adjustment get eliminated as well.
|
| + // TODO: If for some reason the call instruction gets dead-code eliminated
|
| + // after lowering, we would need to ensure that the pre-call and the
|
| + // post-call sp adjustments get eliminated as well.
|
| if (ParameterAreaSizeBytes) {
|
| Operand *SubAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
|
| Legal_Reg | Legal_Flex);
|
| _adjust_stack(ParameterAreaSizeBytes, SubAmount);
|
| }
|
|
|
| - // Copy arguments that are passed on the stack to the appropriate
|
| - // stack locations.
|
| + // Copy arguments that are passed on the stack to the appropriate stack
|
| + // locations.
|
| Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| for (auto &StackArg : StackArgs) {
|
| ConstantInteger32 *Loc =
|
| @@ -1850,9 +1836,9 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
|
| // Copy arguments to be passed in registers to the appropriate registers.
|
| for (auto &GPRArg : GPRArgs) {
|
| Variable *Reg = legalizeToReg(GPRArg.first, GPRArg.second);
|
| - // Generate a FakeUse of register arguments so that they do not get
|
| - // dead code eliminated as a result of the FakeKill of scratch
|
| - // registers after the call.
|
| + // Generate a FakeUse of register arguments so that they do not get dead
|
| + // code eliminated as a result of the FakeKill of scratch registers after
|
| + // the call.
|
| Context.insert(InstFakeUse::create(Func, Reg));
|
| }
|
| for (auto &FPArg : FPArgs) {
|
| @@ -1860,8 +1846,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
|
| Context.insert(InstFakeUse::create(Func, Reg));
|
| }
|
|
|
| - // Generate the call instruction. Assign its result to a temporary
|
| - // with high register allocation weight.
|
| + // Generate the call instruction. Assign its result to a temporary with high
|
| + // register allocation weight.
|
| Variable *Dest = Instr->getDest();
|
| // ReturnReg doubles as ReturnRegLo as necessary.
|
| Variable *ReturnReg = nullptr;
|
| @@ -1901,12 +1887,12 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
|
| }
|
| }
|
| Operand *CallTarget = Instr->getCallTarget();
|
| - // TODO(jvoung): Handle sandboxing.
|
| - // const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
|
| + // TODO(jvoung): Handle sandboxing.
|
| + // const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
|
|
|
| - // Allow ConstantRelocatable to be left alone as a direct call,
|
| - // but force other constants like ConstantInteger32 to be in
|
| - // a register and make it an indirect call.
|
| + // Allow ConstantRelocatable to be left alone as a direct call, but force
|
| + // other constants like ConstantInteger32 to be in a register and make it an
|
| + // indirect call.
|
| if (!llvm::isa<ConstantRelocatable>(CallTarget)) {
|
| CallTarget = legalize(CallTarget, Legal_Reg);
|
| }
|
| @@ -1915,8 +1901,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
|
| if (ReturnRegHi)
|
| Context.insert(InstFakeDef::create(Func, ReturnRegHi));
|
|
|
| - // Add the appropriate offset to SP. The call instruction takes care
|
| - // of resetting the stack offset during emission.
|
| + // Add the appropriate offset to SP. The call instruction takes care of
|
| + // resetting the stack offset during emission.
|
| if (ParameterAreaSizeBytes) {
|
| Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
|
| Legal_Reg | Legal_Flex);
|
| @@ -2024,8 +2010,8 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
|
| Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
|
| Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
|
| Variable *T_Lo = makeReg(DestLo->getType());
|
| - // i32 and i1 can just take up the whole register.
|
| - // i32 doesn't need uxt, while i1 will have an and mask later anyway.
|
| + // i32 and i1 can just take up the whole register. i32 doesn't need uxt,
|
| + // while i1 will have an and mask later anyway.
|
| if (Src0->getType() == IceType_i32 || Src0->getType() == IceType_i1) {
|
| Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
|
| _mov(T_Lo, Src0RF);
|
| @@ -2046,9 +2032,9 @@ void TargetARM32::lowerCast(const InstCast *Inst) {
|
| Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
|
| Constant *One = Ctx->getConstantInt32(1);
|
| Variable *T = makeReg(Dest->getType());
|
| - // Just use _mov instead of _uxt since all registers are 32-bit.
|
| - // _uxt requires the source to be a register so could have required
|
| - // a _mov from legalize anyway.
|
| + // Just use _mov instead of _uxt since all registers are 32-bit. _uxt
|
| + // requires the source to be a register so could have required a _mov
|
| + // from legalize anyway.
|
| _mov(T, Src0RF);
|
| _and(T, T, One);
|
| _mov(Dest, T);
|
| @@ -2288,8 +2274,8 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
|
| // mov.<C2> t, #0 mov.<C2> t, #0
|
| // mov a, t mov a, t
|
| // where the "cmp.eq b.lo, c.lo" is used for unsigned and "sbcs t1, hi, hi"
|
| - // is used for signed compares. In some cases, b and c need to be swapped
|
| - // as well.
|
| + // is used for signed compares. In some cases, b and c need to be swapped as
|
| + // well.
|
| //
|
| // LLVM does:
|
| // for EQ and NE:
|
| @@ -2299,13 +2285,13 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
|
| // mov.<C> t, #1
|
| // mov a, t
|
| //
|
| - // that's nice in that it's just as short but has fewer dependencies
|
| - // for better ILP at the cost of more registers.
|
| + // that's nice in that it's just as short but has fewer dependencies for
|
| + // better ILP at the cost of more registers.
|
| //
|
| - // Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with
|
| - // two unconditional mov #0, two cmps, two conditional mov #1,
|
| - // and one conditonal reg mov. That has few dependencies for good ILP,
|
| - // but is a longer sequence.
|
| + // Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with two
|
| + // unconditional mov #0, two cmps, two conditional mov #1, and one
|
| + // conditional reg mov. That has few dependencies for good ILP, but is a
|
| + // longer sequence.
|
| //
|
| // So, we are going with the GCC version since it's usually better (except
|
| // perhaps for eq/ne). We could revisit special-casing eq/ne later.
|
| @@ -2333,8 +2319,8 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
|
| Variable *ScratchReg = makeReg(IceType_i32);
|
| _cmp(Src0Lo, Src1LoRF);
|
| _sbcs(ScratchReg, Src0Hi, Src1HiRF);
|
| - // ScratchReg isn't going to be used, but we need the
|
| - // side-effect of setting flags from this operation.
|
| + // ScratchReg isn't going to be used, but we need the side-effect of
|
| + // setting flags from this operation.
|
| Context.insert(InstFakeUse::create(Func, ScratchReg));
|
| } else {
|
| _cmp(Src0Hi, Src1HiRF);
|
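| A scalar model of how the cmp + sbcs pair above folds a 64-bit comparison
| into a single flag result. The sketch shows the unsigned ordering so the
| borrow propagation is visible; the signed sequence relies on the same
| propagation read through the signed condition flags (on ARM, the C flag
| after a subtract is the inverse of the borrow):
|
|   #include <cstdint>
|   bool ult64Sketch(uint32_t BLo, uint32_t BHi, uint32_t CLo, uint32_t CHi) {
|     bool Borrow = BLo < CLo;                      // cmp b.lo, c.lo
|     uint64_t Diff = (uint64_t)BHi - CHi - Borrow; // sbcs t, b.hi, c.hi
|     return Diff > 0xFFFFFFFFull;                  // borrowed again => b < c
|   }
|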
| @@ -2354,8 +2340,8 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
|
| // mov.C1 t, #0
|
| // mov.C2 t, #1
|
| // mov a, t
|
| - // where the unsigned/sign extension is not needed for 32-bit.
|
| - // They also have special cases for EQ and NE. E.g., for NE:
|
| + // where the unsigned/sign extension is not needed for 32-bit. They also have
|
| + // special cases for EQ and NE. E.g., for NE:
|
| // <extend to tb, tc>
|
| // subs t, tb, tc
|
| // movne t, #1
|
| @@ -2368,13 +2354,13 @@ void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
|
| // mov.<C> t, #1
|
| // mov a, t
|
| //
|
| - // the left shift is by 0, 16, or 24, which allows the comparison to focus
|
| - // on the digits that actually matter (for 16-bit or 8-bit signed/unsigned).
|
| - // For the unsigned case, for some reason it does similar to GCC and does
|
| - // a uxtb first. It's not clear to me why that special-casing is needed.
|
| + // the left shift is by 0, 16, or 24, which allows the comparison to focus on
|
| + // the digits that actually matter (for 16-bit or 8-bit signed/unsigned). For
|
| + // the unsigned case, for some reason it does something similar to GCC and
|
| + // does a uxtb first. It's not clear to me why that special-casing is needed.
|
| //
|
| - // We'll go with the LLVM way for now, since it's shorter and has just as
|
| - // few dependencies.
|
| + // We'll go with the LLVM way for now, since it's shorter and has just as few
|
| + // dependencies.
|
| int32_t ShiftAmt = 32 - getScalarIntBitWidth(Src0->getType());
|
| assert(ShiftAmt >= 0);
|
| Constant *ShiftConst = nullptr;
|
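| The shifted compare above, modeled for i16 (a sketch; the same pattern
| covers i8 with a shift of 24 and i32 with a shift of 0):
|
|   #include <cstdint>
|   // Comparing (a << 16) with (b << 16) as i32 matches a native 16-bit signed
|   // compare, because only the top 16 bits participate in the comparison.
|   bool slt16Sketch(int16_t A, int16_t B) {
|     int32_t ShiftedA = (int32_t)((uint32_t)(uint16_t)A << 16);
|     int32_t ShiftedB = (int32_t)((uint32_t)(uint16_t)B << 16);
|     return ShiftedA < ShiftedB;
|   }
|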
| @@ -2417,9 +2403,9 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
|
| UnimplementedError(Func->getContext()->getFlags());
|
| return;
|
| case Intrinsics::AtomicFenceAll:
|
| - // NOTE: FenceAll should prevent and load/store from being moved
|
| - // across the fence (both atomic and non-atomic). The InstARM32Mfence
|
| - // instruction is currently marked coarsely as "HasSideEffects".
|
| + // NOTE: FenceAll should prevent any load/store from being moved across the
|
| + // fence (both atomic and non-atomic). The InstARM32Mfence instruction is
|
| + // currently marked coarsely as "HasSideEffects".
|
| UnimplementedError(Func->getContext()->getFlags());
|
| return;
|
| case Intrinsics::AtomicIsLockFree: {
|
| @@ -2477,10 +2463,10 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
|
| Call->addArg(Val);
|
| lowerCall(Call);
|
| // The popcount helpers always return 32-bit values, while the intrinsic's
|
| - // signature matches some 64-bit platform's native instructions and
|
| - // expect to fill a 64-bit reg. Thus, clear the upper bits of the dest
|
| - // just in case the user doesn't do that in the IR or doesn't toss the bits
|
| - // via truncate.
|
| + // signature matches some 64-bit platform's native instructions and expects
|
| + // to fill a 64-bit reg. Thus, clear the upper bits of the dest just in
|
| + // case the user doesn't do that in the IR or doesn't toss the bits via
|
| + // truncate.
|
| if (Val->getType() == IceType_i64) {
|
| Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
|
| Constant *Zero = Ctx->getConstantZero(IceType_i32);
|
| @@ -2491,8 +2477,8 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
|
| return;
|
| }
|
| case Intrinsics::Ctlz: {
|
| - // The "is zero undef" parameter is ignored and we always return
|
| - // a well-defined value.
|
| + // The "is zero undef" parameter is ignored and we always return a
|
| + // well-defined value.
|
| Operand *Val = Instr->getArg(0);
|
| Variable *ValLoR;
|
| Variable *ValHiR = nullptr;
|
| @@ -2639,9 +2625,9 @@ void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
|
| Variable *T2 = makeReg(IceType_i32);
|
| _add(T2, T, ThirtyTwo);
|
| _clz(T2, ValHiR, CondARM32::NE);
|
| - // T2 is actually a source as well when the predicate is not AL
|
| - // (since it may leave T2 alone). We use set_dest_nonkillable to
|
| - // prolong the liveness of T2 as if it was used as a source.
|
| + // T2 is actually a source as well when the predicate is not AL (since it
|
| + // may leave T2 alone). We use set_dest_nonkillable to prolong the liveness
|
| + // of T2 as if it was used as a source.
|
| _set_dest_nonkillable();
|
| _mov(DestLo, T2);
|
| Variable *T3 = nullptr;
|
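| A scalar model of the 64-bit CLZ composition above: count leading zeros of
| the hi word, and when hi is zero use 32 plus the lo word's count (sketch
| using the GCC/Clang builtin; __builtin_clz(0) is undefined, hence the
| guard):
|
|   #include <cstdint>
|   uint32_t clz32Sketch(uint32_t V) { return V ? __builtin_clz(V) : 32; }
|   uint32_t clz64Sketch(uint32_t Lo, uint32_t Hi) {
|     return Hi ? clz32Sketch(Hi) : 32 + clz32Sketch(Lo);
|   }
|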
| @@ -2654,15 +2640,14 @@ void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
|
| }
|
|
|
| void TargetARM32::lowerLoad(const InstLoad *Load) {
|
| - // A Load instruction can be treated the same as an Assign
|
| - // instruction, after the source operand is transformed into an
|
| - // OperandARM32Mem operand.
|
| + // A Load instruction can be treated the same as an Assign instruction, after
|
| + // the source operand is transformed into an OperandARM32Mem operand.
|
| Type Ty = Load->getDest()->getType();
|
| Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
|
| Variable *DestLoad = Load->getDest();
|
|
|
| - // TODO(jvoung): handled folding opportunities. Sign and zero extension
|
| - // can be folded into a load.
|
| + // TODO(jvoung): handle folding opportunities. Sign and zero extension can
|
| + // be folded into a load.
|
| InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
|
| lowerAssign(Assign);
|
| }
|
| @@ -2708,17 +2693,15 @@ void TargetARM32::lowerRet(const InstRet *Inst) {
|
| _mov(Reg, Src0F, CondARM32::AL, RegARM32::Reg_r0);
|
| }
|
| }
|
| - // Add a ret instruction even if sandboxing is enabled, because
|
| - // addEpilog explicitly looks for a ret instruction as a marker for
|
| - // where to insert the frame removal instructions.
|
| - // addEpilog is responsible for restoring the "lr" register as needed
|
| - // prior to this ret instruction.
|
| + // Add a ret instruction even if sandboxing is enabled, because addEpilog
|
| + // explicitly looks for a ret instruction as a marker for where to insert the
|
| + // frame removal instructions. addEpilog is responsible for restoring the
|
| + // "lr" register as needed prior to this ret instruction.
|
| _ret(getPhysicalRegister(RegARM32::Reg_lr), Reg);
|
| - // Add a fake use of sp to make sure sp stays alive for the entire
|
| - // function. Otherwise post-call sp adjustments get dead-code
|
| - // eliminated. TODO: Are there more places where the fake use
|
| - // should be inserted? E.g. "void f(int n){while(1) g(n);}" may not
|
| - // have a ret instruction.
|
| + // Add a fake use of sp to make sure sp stays alive for the entire function.
|
| + // Otherwise post-call sp adjustments get dead-code eliminated.
|
| + // TODO: Are there more places where the fake use should be inserted? E.g.
|
| + // "void f(int n){while(1) g(n);}" may not have a ret instruction.
|
| Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
|
| Context.insert(InstFakeUse::create(Func, SP));
|
| }
|
| @@ -2852,8 +2835,8 @@ Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
|
| if (isVectorType(Ty) || isFloatingType(Ty)) {
|
| _vmov(Reg, Src);
|
| } else {
|
| - // Mov's Src operand can really only be the flexible second operand type
|
| - // or a register. Users should guarantee that.
|
| + // Mov's Src operand can really only be the flexible second operand type or
|
| + // a register. Users should guarantee that.
|
| _mov(Reg, Src);
|
| }
|
| return Reg;
|
| @@ -2862,18 +2845,17 @@ Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
|
| Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
|
| int32_t RegNum) {
|
| Type Ty = From->getType();
|
| - // Assert that a physical register is allowed. To date, all calls
|
| - // to legalize() allow a physical register. Legal_Flex converts
|
| - // registers to the right type OperandARM32FlexReg as needed.
|
| + // Assert that a physical register is allowed. To date, all calls to
|
| + // legalize() allow a physical register. Legal_Flex converts registers to the
|
| + // right OperandARM32FlexReg type as needed.
|
| assert(Allowed & Legal_Reg);
|
| - // Go through the various types of operands:
|
| - // OperandARM32Mem, OperandARM32Flex, Constant, and Variable.
|
| - // Given the above assertion, if type of operand is not legal
|
| - // (e.g., OperandARM32Mem and !Legal_Mem), we can always copy
|
| - // to a register.
|
| + // Go through the various types of operands: OperandARM32Mem,
|
| + // OperandARM32Flex, Constant, and Variable. Given the above assertion, if
|
| + // the type of operand is not legal (e.g., OperandARM32Mem and !Legal_Mem), we
|
| + // can always copy to a register.
|
| if (auto Mem = llvm::dyn_cast<OperandARM32Mem>(From)) {
|
| - // Before doing anything with a Mem operand, we need to ensure
|
| - // that the Base and Index components are in physical registers.
|
| + // Before doing anything with a Mem operand, we need to ensure that the
|
| + // Base and Index components are in physical registers.
|
| Variable *Base = Mem->getBase();
|
| Variable *Index = Mem->getIndex();
|
| Variable *RegBase = nullptr;
|
| @@ -2918,8 +2900,8 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
|
| if (auto FlexReg = llvm::dyn_cast<OperandARM32FlexReg>(Flex)) {
|
| if (FlexReg->getShiftOp() == OperandARM32::kNoShift) {
|
| From = FlexReg->getReg();
|
| - // Fall through and let From be checked as a Variable below,
|
| - // where it may or may not need a register.
|
| + // Fall through and let From be checked as a Variable below, where it
|
| + // may or may not need a register.
|
| } else {
|
| return copyToReg(Flex, RegNum);
|
| }
|
| @@ -2944,10 +2926,10 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
|
| uint32_t RotateAmt;
|
| uint32_t Immed_8;
|
| uint32_t Value = static_cast<uint32_t>(C32->getValue());
|
| - // Check if the immediate will fit in a Flexible second operand,
|
| - // if a Flexible second operand is allowed. We need to know the exact
|
| - // value, so that rules out relocatable constants.
|
| - // Also try the inverse and use MVN if possible.
|
| + // Check if the immediate will fit in a Flexible second operand, if a
|
| + // Flexible second operand is allowed. We need to know the exact value,
|
| + // so that rules out relocatable constants. Also try the inverse and use
|
| + // MVN if possible.
|
| if (CanBeFlex &&
|
| OperandARM32FlexImm::canHoldImm(Value, &RotateAmt, &Immed_8)) {
|
| return OperandARM32FlexImm::create(Func, Ty, Immed_8, RotateAmt);
|
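| The encodability test used above, as a standalone sketch (not Subzero's
| actual canHoldImm): an A32 flexible immediate is an 8-bit value rotated
| right by an even amount.
|
|   #include <cstdint>
|   bool canHoldImmSketch(uint32_t Value, uint32_t *RotateAmt, uint32_t *Immed8) {
|     for (uint32_t Rot = 0; Rot < 32; Rot += 2) {
|       // Rotating Value left by Rot undoes a rotate-right-by-Rot encoding.
|       uint32_t Imm = Rot ? (Value << Rot) | (Value >> (32 - Rot)) : Value;
|       if (Imm <= 0xFF) {
|         *RotateAmt = Rot / 2; // assumed to be the encoded 4-bit rotate field
|         *Immed8 = Imm;
|         return true;
|       }
|     }
|     return false;
|   }
|   // e.g. 0xFF000000 encodes as Immed8 = 0xFF with a rotate-right of 8.
|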
| @@ -2977,12 +2959,12 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
|
| } else {
|
| assert(isScalarFloatingType(Ty));
|
| // Load floats/doubles from literal pool.
|
| - // TODO(jvoung): Allow certain immediates to be encoded directly in
|
| - // an operand. See Table A7-18 of the ARM manual:
|
| - // "Floating-point modified immediate constants".
|
| - // Or, for 32-bit floating point numbers, just encode the raw bits
|
| - // into a movw/movt pair to GPR, and vmov to an SREG, instead of using
|
| - // a movw/movt pair to get the const-pool address then loading to SREG.
|
| + // TODO(jvoung): Allow certain immediates to be encoded directly in an
|
| + // operand. See Table A7-18 of the ARM manual: "Floating-point modified
|
| + // immediate constants". Or, for 32-bit floating point numbers, just
|
| + // encode the raw bits into a movw/movt pair to GPR, and vmov to an SREG,
|
| + // instead of using a movw/movt pair to get the const-pool address then
|
| + // loading to SREG.
|
| std::string Buffer;
|
| llvm::raw_string_ostream StrBuf(Buffer);
|
| llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
|
| @@ -2997,9 +2979,9 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
|
| }
|
|
|
| if (auto Var = llvm::dyn_cast<Variable>(From)) {
|
| - // Check if the variable is guaranteed a physical register. This
|
| - // can happen either when the variable is pre-colored or when it is
|
| - // assigned infinite weight.
|
| + // Check if the variable is guaranteed a physical register. This can happen
|
| + // either when the variable is pre-colored or when it is assigned infinite
|
| + // weight.
|
| bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
|
| // We need a new physical register for the operand if:
|
| // Mem is not allowed and Var isn't guaranteed a physical
|
| @@ -3025,17 +3007,16 @@ Variable *TargetARM32::legalizeToReg(Operand *From, int32_t RegNum) {
|
| Operand *TargetARM32::legalizeUndef(Operand *From, int32_t RegNum) {
|
| Type Ty = From->getType();
|
| if (llvm::isa<ConstantUndef>(From)) {
|
| - // Lower undefs to zero. Another option is to lower undefs to an
|
| - // uninitialized register; however, using an uninitialized register
|
| - // results in less predictable code.
|
| + // Lower undefs to zero. Another option is to lower undefs to an
|
| + // uninitialized register; however, using an uninitialized register results
|
| + // in less predictable code.
|
| //
|
| - // If in the future the implementation is changed to lower undef
|
| - // values to uninitialized registers, a FakeDef will be needed:
|
| - // Context.insert(InstFakeDef::create(Func, Reg));
|
| - // This is in order to ensure that the live range of Reg is not
|
| - // overestimated. If the constant being lowered is a 64 bit value,
|
| - // then the result should be split and the lo and hi components will
|
| - // need to go in uninitialized registers.
|
| + // If in the future the implementation is changed to lower undef values to
|
| + // uninitialized registers, a FakeDef will be needed:
|
| + //   Context.insert(InstFakeDef::create(Func, Reg));
|
| + // This is in order to ensure that the live range of Reg is not
|
| + // overestimated. If the constant being lowered is a 64 bit value, then the
|
| + // result should be split and the lo and hi components will need to go in
|
| + // uninitialized registers.
|
| if (isVectorType(Ty))
|
| return makeVectorOfZeros(Ty, RegNum);
|
| return Ctx->getConstantZero(Ty);
|
| @@ -3045,15 +3026,15 @@ Operand *TargetARM32::legalizeUndef(Operand *From, int32_t RegNum) {
|
|
|
| OperandARM32Mem *TargetARM32::formMemoryOperand(Operand *Operand, Type Ty) {
|
| OperandARM32Mem *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand);
|
| - // It may be the case that address mode optimization already creates
|
| - // an OperandARM32Mem, so in that case it wouldn't need another level
|
| - // of transformation.
|
| + // It may be the case that address mode optimization already creates an
|
| + // OperandARM32Mem, so in that case it wouldn't need another level of
|
| + // transformation.
|
| if (Mem) {
|
| return llvm::cast<OperandARM32Mem>(legalize(Mem));
|
| }
|
| - // If we didn't do address mode optimization, then we only
|
| - // have a base/offset to work with. ARM always requires a base
|
| - // register, so just use that to hold the operand.
|
| + // If we didn't do address mode optimization, then we only have a base/offset
|
| + // to work with. ARM always requires a base register, so just use that to
|
| + // hold the operand.
|
| Variable *Base = legalizeToReg(Operand);
|
| return OperandARM32Mem::create(
|
| Func, Ty, Base,
|
| @@ -3076,9 +3057,9 @@ void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
|
| uint32_t RotateAmt;
|
| uint32_t Immed_8;
|
| Operand *Mask;
|
| - // Use AND or BIC to mask off the bits, depending on which immediate fits
|
| - // (if it fits at all). Assume Align is usually small, in which case BIC
|
| - // works better. Thus, this rounds down to the alignment.
|
| + // Use AND or BIC to mask off the bits, depending on which immediate fits (if
|
| + // it fits at all). Assume Align is usually small, in which case BIC works
|
| + // better. Thus, this rounds down to the alignment.
|
| if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
|
| Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
|
| _bic(Reg, Reg, Mask);
|
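| The arithmetic behind the BIC/AND choice above: with Align a power of two,
| clearing the low bits rounds a value down to a multiple of Align. BIC uses
| the mask Align - 1 directly while AND needs its complement, and the two
| immediates differ in which one fits the flexible-immediate encoding
| (sketch):
|
|   #include <cstdint>
|   uint32_t alignDownSketch(uint32_t Value, uint32_t Align) {
|     return Value & ~(Align - 1); // BIC Reg, Reg, #(Align - 1) does the same
|   }
|   // e.g. alignDownSketch(37, 16) == 32.
|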
| @@ -3170,17 +3151,18 @@ void TargetHeaderARM32::lower() {
|
| OstreamLocker L(Ctx);
|
| Ostream &Str = Ctx->getStrEmit();
|
| Str << ".syntax unified\n";
|
| - // Emit build attributes in format: .eabi_attribute TAG, VALUE.
|
| - // See Sec. 2 of "Addenda to, and Errata in the ABI for the ARM architecture"
|
| - // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0045d/IHI0045D_ABI_addenda.pdf
|
| + // Emit build attributes in format: .eabi_attribute TAG, VALUE. See Sec. 2 of
|
| + // "Addenda to, and Errata in the ABI for the ARM architecture"
|
| + // http://infocenter.arm.com/help/topic/com.arm.doc.ihi0045d/IHI0045D_ABI_addenda.pdf
|
| //
|
| - // Tag_conformance should be be emitted first in a file-scope
|
| - // sub-subsection of the first public subsection of the attributes.
|
| + // Tag_conformance should be emitted first in a file-scope sub-subsection
|
| + // of the first public subsection of the attributes.
|
| Str << ".eabi_attribute 67, \"2.09\" @ Tag_conformance\n";
|
| - // Chromebooks are at least A15, but do A9 for higher compat.
|
| - // For some reason, the LLVM ARM asm parser has the .cpu directive override
|
| - // the mattr specified on the commandline. So to test hwdiv, we need to set
|
| - // the .cpu directive higher (can't just rely on --mattr=...).
|
| + // Chromebooks are at least A15, but do A9 for higher compat. For some
|
| + // reason, the LLVM ARM asm parser has the .cpu directive override the mattr
|
| + // specified on the commandline. So to test hwdiv, we need to set the .cpu
|
| + // directive higher (can't just rely on --mattr=...).
|
| if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
|
| Str << ".cpu cortex-a15\n";
|
| } else {
|
|
|