Index: src/IceCfgNode.cpp |
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp |
index a1804d5cdf4a30f477820d25b80ef6d1a7b7bb9e..925500ed9c15c5b17dd547299b0fe7e52284dc64 100644 |
--- a/src/IceCfgNode.cpp |
+++ b/src/IceCfgNode.cpp |
@@ -24,7 +24,7 @@ namespace Ice { |
CfgNode::CfgNode(Cfg *Func, SizeT LabelNumber, IceString Name) |
: Func(Func), Number(LabelNumber), Name(Name), HasReturn(false), |
- InstCountEstimate(0) {} |
+ NeedsPlacement(false), InstCountEstimate(0) {} |
// Returns the name the node was created with. If no name was given, |
// it synthesizes a (hopefully) unique name. |
@@ -204,6 +204,269 @@ void CfgNode::deletePhis() { |
I->setDeleted(); |
} |
+// Splits the edge from Pred to this node by creating a new node and |
+// hooking up the in and out edges appropriately. (The EdgeIndex |
+// parameter is only used to make the new node's name unique when |
+// there are multiple edges between the same pair of nodes.) The new |
+// node's instruction list is initialized to the empty list, with no |
+// terminator instruction. If there are multiple edges from Pred to |
+// this node, only one edge is split, and the particular choice of |
+// edge is undefined. This could happen with a switch instruction, or |
+// a conditional branch that weirdly has both branches to the same |
+// place. TODO(stichnot,kschimpf): Figure out whether this is legal |
+// in the LLVM IR or the PNaCl bitcode, and if so, we need to |
+// establish a strong relationship among the ordering of Pred's |
+// out-edge list, this node's in-edge list, and the Phi instruction's |
jvoung (off chromium)
2014/10/27 22:12:20
So if this happens, the phi lowering might be wron
Jim Stichnoth
2014/10/28 01:20:14
After thinking about this, my belief is that if th
|
+// operand list. |
+CfgNode *CfgNode::splitIncomingEdge(CfgNode *Pred, SizeT EdgeIndex) { |
+ CfgNode *NewNode = |
+ Func->makeNode("split_" + Pred->getName() + "_" + getName() + "_" + |
+ std::to_string(EdgeIndex)); |
+ // The new node is added to the end of the node list, and will later |
+ // need to be sorted into a reasonable topological order. |
+ NewNode->setNeedsPlacement(true); |
+ // Repoint Pred's out-edge. |
+ bool Found = false; |
+ for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end(); |
+ !Found && I != E; ++I) { |
+ if (*I == this) { |
+ *I = NewNode; |
+ NewNode->InEdges.push_back(Pred); |
+ Found = true; |
+ } |
+ } |
+ assert(Found); |
+ // Repoint this node's in-edge. |
+ Found = false; |
+ for (auto I = InEdges.begin(), E = InEdges.end(); !Found && I != E; ++I) { |
+ if (*I == Pred) { |
+ *I = NewNode; |
+ NewNode->OutEdges.push_back(this); |
+ Found = true; |
+ } |
+ } |
+ assert(Found); |
+ // Repoint a suitable branch instruction's target. |
+ Found = false; |
+ for (auto I = Pred->getInsts().rbegin(), E = Pred->getInsts().rend(); |
+ !Found && I != E; ++I) { |
+ if (!(*I)->isDeleted()) { |
+ Found = (*I)->repointEdge(this, NewNode); |
+ } |
+ } |
+ assert(Found); |
+ return NewNode; |
+} |
+ |
+namespace { |
+ |
+// Helper function used by advancedPhiLowering(). |
+bool sameVarOrReg(const Variable *Var, const Operand *Opnd) { |
+ if (Var == Opnd) |
+ return true; |
+ if (const auto Var2 = llvm::dyn_cast<Variable>(Opnd)) { |
+ if (Var->hasReg() && Var->getRegNum() == Var2->getRegNum()) |
+ return true; |
+ } |
+ return false; |
+} |
+ |
+} // end of anonymous namespace |
+ |
+// This the "advanced" version of Phi lowering for a basic block, in |
+// contrast to the simple version that lowers through assignments |
+// involving temporaries. |
+// |
+// All Phi instructions in a basic block are conceptually executed in |
+// parallel. However, if we lower Phis early and commit to a |
+// sequential ordering, we may end up creating unnecessary |
+// interferences which lead to worse register allocation. Delaying |
+// Phi scheduling until after register allocation can helps unless |
jvoung (off chromium)
2014/10/27 22:12:21
"can helps" -> "can help"
Jim Stichnoth
2014/10/28 01:20:14
Done.
|
+// there are no free registers for shuffling registers or stack slots |
+// and spilling becomes necessary. |
+// |
+// The advanced Phi lowering starts by finding a topological sort of |
+// the Phi instructions, where "A=B" comes before "B=C" due to the |
+// anti-dependence on B. If a topological sort is not possible due to |
+// a cycle, the cycle is broken by introducing a non-parallel |
+// temporary. For example, a cycle arising from a permutation like |
+// "A=B;B=C;C=A" can become "T=A;A=B;B=C;C=T". All else being equal, |
+// prefer to schedule assignments with register-allocated Src operands |
+// earlier, in case that register becomes free afterwards; and prefer |
jvoung (off chromium)
2014/10/27 22:12:21
", and" instead of "; and" ?
Jim Stichnoth
2014/10/28 01:20:14
Done.
|
+// to schedule assignments with register-allocated Dest variables |
+// later, to keep that register free for longer. |
+// |
+// Once the ordering is determined, the Cfg edge is split and the |
+// assignment list is lowered by the target lowering layer. The |
+// specific placement of the new node within the Cfg node list is |
+// deferred until later, including after empty node contraction. |
+void CfgNode::advancedPhiLowering() { |
+ if (getPhis().empty()) |
+ return; |
+ |
+ // Count the number of non-deleted Phi instructions. |
+ struct { |
+ InstPhi *Phi; |
+ Variable *Dest; |
+ Operand *Src; |
+ bool Processed; |
+ size_t NumPred; // number of entries whose Src is this Dest |
+ int32_t Weight; // preference for topological order |
+ } Desc[getPhis().size()]; |
+ |
+ size_t NumPhis = 0; |
+ for (InstPhi *Inst : getPhis()) { |
+ if (!Inst->isDeleted()) { |
+ Desc[NumPhis].Phi = Inst; |
+ Desc[NumPhis].Dest = Inst->getDest(); |
+ ++NumPhis; |
+ } |
+ } |
+ if (NumPhis == 0) |
+ return; |
+ |
+ SizeT InEdgeIndex = 0; |
+ for (CfgNode *Pred : InEdges) { |
+ CfgNode *Split = splitIncomingEdge(Pred, InEdgeIndex++); |
+ AssignList Assignments; |
+ |
+ // First pass computes Src and initializes NumPred. |
+ for (size_t I = 0; I < NumPhis; ++I) { |
+ Desc[I].Src = Desc[I].Phi->getOperandForTarget(Pred); |
+ Desc[I].Processed = false; |
+ Desc[I].NumPred = 0; |
+ } |
+ // Second pass computes NumPred by comparing every pair of Phi |
+ // instructions. |
+ for (size_t I = 0; I < NumPhis; ++I) { |
+ if (Desc[I].Processed) |
jvoung (off chromium)
2014/10/27 22:12:21
It wasn't clear to me why Processed would be true
Jim Stichnoth
2014/10/28 01:20:14
You're right - I think I over-conservatively added
Jim Stichnoth
2014/10/30 00:25:25
BTW, these checks are now back after moving the re
|
+ continue; |
+ const Variable *Dest = Desc[I].Dest; |
+ for (size_t J = 0; J < NumPhis; ++J) { |
+ if (Desc[J].Processed) |
+ continue; |
+ if (I != J) { |
+ // There shouldn't be two Phis with the same Dest variable |
+ // or register. |
+ assert(!sameVarOrReg(Dest, Desc[J].Dest)); |
+ } |
+ const Operand *Src = Desc[J].Src; |
+ if (sameVarOrReg(Dest, Src)) |
+ ++Desc[I].NumPred; |
+ } |
+ } |
+ |
+ // Another pass to compute initial Weight values. |
+ |
+ // Always pick NumPred=0 over NumPred>0. |
+ const int32_t WeightNoPreds = 4; |
+ // Prefer Src as a register because the register might free up. |
+ const int32_t WeightSrcIsReg = 2; |
+ // Prefer Dest not as a register because the register stays free |
+ // longer. |
+ const int32_t WeightDestNotReg = 1; |
+ |
+ for (size_t I = 0; I < NumPhis; ++I) { |
+ if (Desc[I].Processed) |
+ continue; |
+ int32_t Weight = 0; |
+ if (Desc[I].NumPred == 0) |
+ Weight += WeightNoPreds; |
+ if (auto Var = llvm::dyn_cast<Variable>(Desc[I].Src)) |
+ if (Var->hasReg()) |
+ Weight += WeightSrcIsReg; |
+ if (!Desc[I].Dest->hasReg()) |
+ Weight += WeightDestNotReg; |
+ Desc[I].Weight = Weight; |
+ } |
+ |
+ // Repeatedly choose and process the best candidate in the |
+ // topological sort, until no candidates remain. This |
+ // implementation is O(N^2) where N is the number of Phi |
+ // instructions, but with a small constant factor compared to a |
+ // likely implementation of O(N) topological sort. |
+ for (size_t Remaining = NumPhis; Remaining; --Remaining) { |
+ size_t BestIndex = 0; |
+ int32_t BestWeight = -1; |
+ // Find the best candidate. |
+ for (size_t I = 0; I < NumPhis; ++I) { |
+ if (Desc[I].Processed) |
+ continue; |
+ int32_t Weight = 0; |
+ Weight = Desc[I].Weight; |
+ if (Weight > BestWeight) { |
+ BestIndex = I; |
+ BestWeight = Weight; |
+ } |
+ } |
+ assert(BestWeight >= 0); |
+ assert(Desc[BestIndex].NumPred <= 1); |
+ Variable *Dest = Desc[BestIndex].Dest; |
+ Operand *Src = Desc[BestIndex].Src; |
+ // Break a cycle by introducing a temporary. Except don't |
+ // bother for a self-assignment, which looks like a self-cycle |
+ // with NumPred=1, since it can be left alone and will |
+ // eventually be removed as a redundant assignment. |
+ bool IsSelfAssignment = sameVarOrReg(Dest, Src); |
+ if (!IsSelfAssignment && Desc[BestIndex].NumPred) { |
+ bool Found = false; |
+ // If the target instruction "A=B" is part of a cycle, find |
+ // the "X=A" assignment in the cycle because it will have to |
+ // be rewritten as "X=tmp". |
+ for (size_t J = 0; !Found && J < NumPhis; ++J) { |
+ if (Desc[J].Processed) |
+ continue; |
+ Operand *OtherSrc = Desc[J].Src; |
+ if (Desc[J].NumPred && sameVarOrReg(Dest, OtherSrc)) { |
+ // Set Tmp's optional Name parameter to something like |
jvoung (off chromium)
2014/10/27 22:12:21
Did you want to do the name-setting here, or mark
Jim Stichnoth
2014/10/28 01:20:14
I meant it sort of as a weak TODO. But thinking a
|
+ // "__split" + std::to_string(Func->getNumVariables()) |
+ // for easier identification during debugging if desired. |
+ Variable *Tmp = Func->makeVariable(OtherSrc->getType()); |
+ Tmp->setNeedsStackSlot(); |
+ Assignments.push_back(InstAssign::create(Func, Tmp, OtherSrc)); |
+ Desc[J].Src = Tmp; |
+ Found = true; |
+ } |
+ } |
+ assert(Found); |
+ } |
+ // Now that a cycle (if any) has been broken, create the actual |
+ // assignment. Except if Dest and Src are syntactically the |
+ // same, don't bother adding the assignment, because in all |
+ // respects it would be redundant, and if Dest/Src are on the |
+ // stack, the target lowering may naively decide to lower it |
+ // using a temporary register. |
+ if (Dest != Src) |
+ Assignments.push_back(InstAssign::create(Func, Dest, Src)); |
+ // Update NumPred for all Phi assignments using this Phi's Src |
+ // as their Dest variable. Also update Weight if NumPred |
+ // dropped from 1 to 0. |
+ if (Variable *Var = llvm::dyn_cast<Variable>(Src)) { |
jvoung (off chromium)
2014/10/27 22:12:21
nit: could auto this too
Jim Stichnoth
2014/10/28 01:20:14
Done.
|
+ for (size_t I = 0; I < NumPhis; ++I) { |
+ if (Desc[I].Processed) |
+ continue; |
+ if (sameVarOrReg(Var, Desc[I].Dest)) { |
+ if (--Desc[I].NumPred == 0) |
+ Desc[I].Weight += WeightNoPreds; |
+ } |
+ } |
+ } |
+ Desc[BestIndex].Processed = true; |
+ } |
+ |
+ Func->getTarget()->lowerPhiAssignments(Split, Assignments); |
+ |
+ // Renumber the instructions to be monotonically increasing so |
+ // that addNode() doesn't assert when multi-definitions are added |
+ // out of order. |
+ Split->renumberInstructions(); |
+ Func->getVMetadata()->addNode(Split); |
+ } |
+ |
+ for (InstPhi *Inst : getPhis()) |
+ Inst->setDeleted(); |
+} |
+ |
// Does address mode optimization. Pass each instruction to the |
// TargetLowering object. If it returns a new instruction |
// (representing the optimized address mode), then insert the new |
@@ -240,7 +503,7 @@ void CfgNode::doNopInsertion() { |
void CfgNode::genCode() { |
TargetLowering *Target = Func->getTarget(); |
LoweringContext &Context = Target->getContext(); |
- // Lower only the regular instructions. Defer the Phi instructions. |
+ // Lower the regular instructions. |
Context.init(this); |
while (!Context.atEnd()) { |
InstList::iterator Orig = Context.getCur(); |
@@ -250,6 +513,8 @@ void CfgNode::genCode() { |
// Ensure target lowering actually moved the cursor. |
assert(Context.getCur() != Orig); |
} |
+ // Do preliminary lowering of the Phi instructions. |
+ Target->prelowerPhis(); |
} |
void CfgNode::livenessLightweight() { |
@@ -300,7 +565,6 @@ bool CfgNode::liveness(Liveness *Liveness) { |
Liveness->getLiveOut(this) = Live; |
// Process regular instructions in reverse order. |
- // TODO(stichnot): Use llvm::make_range with LLVM 3.5. |
for (auto I = Insts.rbegin(), E = Insts.rend(); I != E; ++I) { |
if ((*I)->isDeleted()) |
continue; |
@@ -309,13 +573,15 @@ bool CfgNode::liveness(Liveness *Liveness) { |
// Process phis in forward order so that we can override the |
// instruction number to be that of the earliest phi instruction in |
// the block. |
+ SizeT NumNonDeadPhis = 0; |
InstNumberT FirstPhiNumber = Inst::NumberSentinel; |
for (InstPhi *I : Phis) { |
if (I->isDeleted()) |
continue; |
if (FirstPhiNumber == Inst::NumberSentinel) |
FirstPhiNumber = I->getNumber(); |
- I->liveness(FirstPhiNumber, Live, Liveness, LiveBegin, LiveEnd); |
+ if (I->liveness(FirstPhiNumber, Live, Liveness, LiveBegin, LiveEnd)) |
+ ++NumNonDeadPhis; |
} |
// When using the sparse representation, after traversing the |
@@ -350,9 +616,12 @@ bool CfgNode::liveness(Liveness *Liveness) { |
// Add in current LiveIn |
Live |= LiveIn; |
// Check result, set LiveIn=Live |
- Changed = (Live != LiveIn); |
- if (Changed) |
+ SizeT &PrevNumNonDeadPhis = Liveness->getNumNonDeadPhis(this); |
+ bool LiveInChanged = (Live != LiveIn); |
+ Changed = (NumNonDeadPhis != PrevNumNonDeadPhis || LiveInChanged); |
+ if (LiveInChanged) |
LiveIn = Live; |
+ PrevNumNonDeadPhis = NumNonDeadPhis; |
return Changed; |
} |
@@ -472,6 +741,40 @@ void CfgNode::livenessPostprocess(LivenessMode Mode, Liveness *Liveness) { |
} |
} |
+// If this node contains only deleted instructions, and ends in an |
+// unconditional branch, contract the node by repointing all its |
+// in-edges to its successor. |
+void CfgNode::contractIfEmpty() { |
+ if (InEdges.size() == 0) |
+ return; |
+ Inst *Branch = NULL; |
+ for (Inst *I : Insts) { |
+ if (!I->isDeleted() && !I->isUnconditionalBranch()) |
+ return; |
+ Branch = I; |
+ } |
+ Branch->setDeleted(); |
+ assert(OutEdges.size() == 1); |
+ // Repoint all this node's in-edges to this node's successor. |
+ for (CfgNode *Pred : InEdges) { |
+ for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end(); I != E; |
+ ++I) { |
+ if (*I == this) { |
+ *I = OutEdges[0]; |
+ OutEdges[0]->InEdges.push_back(Pred); |
+ } |
+ } |
+ for (Inst *I : Pred->getInsts()) { |
+ if (!I->isDeleted()) |
+ I->repointEdge(this, OutEdges[0]); |
jvoung (off chromium)
2014/10/27 22:12:20
Could break out of the loop here too, once the edg
Jim Stichnoth
2014/10/28 01:20:14
E.g., there could be a lowered Switch instruction
jvoung (off chromium)
2014/10/29 13:46:50
Ah okay... I was looking at the way splitIncomingE
|
+ } |
+ } |
+ InEdges.clear(); |
+ // Don't bother removing the single out-edge, which would also |
+ // require finding the corresponding in-edge in the successor and |
+ // removing it. |
+} |
+ |
void CfgNode::doBranchOpt(const CfgNode *NextNode) { |
TargetLowering *Target = Func->getTarget(); |
// Check every instruction for a branch optimization opportunity. |
@@ -479,8 +782,11 @@ void CfgNode::doBranchOpt(const CfgNode *NextNode) { |
// first opportunity, unless there is some target lowering where we |
// have the possibility of multiple such optimizations per block |
// (currently not the case for x86 lowering). |
- for (Inst *I : Insts) |
- Target->doBranchOpt(I, NextNode); |
+ for (Inst *I : Insts) { |
+ if (!I->isDeleted()) { |
+ Target->doBranchOpt(I, NextNode); |
+ } |
+ } |
} |
// ======================== Dump routines ======================== // |