Index: src/IceTargetLoweringX8632.cpp
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index bdae39451bf3e727e8d775d93c5f815cf42bb96a..45d68921d0c36cf421538f6d3b16deb2c05b9cf9 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -482,6 +482,7 @@ void TargetX8632::translateO2() {
     return;
   Func->dump("After x86 address mode opt");
 
+  doLoadOpt();
   Func->genCode();
   if (Func->hasError())
     return;
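
The new pass is slotted in after address mode optimization and before Func->genCode(). Running it at this point presumably lets formMemoryOperand reuse the OperandX8632Mem operands that the address mode pass has already built, while the instructions being folded are still in their pre-lowering form.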
@@ -572,6 +573,126 @@ void TargetX8632::translateOm1() {
   }
 }
 
+namespace {
+
+// Converts a ConstantInteger32 operand into its constant value, or
+// MemoryOrderInvalid if the operand is not a ConstantInteger32.
+uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+
+// Determines whether the dest of a Load instruction can be folded
+// into one of the src operands of a 2-operand instruction.  This is
+// true as long as the load dest matches exactly one of the binary
+// instruction's src operands.  Replaces Src0 or Src1 with LoadSrc if
+// the answer is true.
+bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                               Operand *&Src0, Operand *&Src1) {
+  if (Src0 == LoadDest && Src1 != LoadDest) {
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) {
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false;
+}
+
+} // end of anonymous namespace
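
The "exactly one" requirement in canFoldLoadIntoBinaryInst is what keeps a pattern like c = a + a from being folded, which would otherwise duplicate the memory access (presumably incorrect for an atomic load, and wasteful in general). Below is a minimal, self-contained sketch of the rule; Operand and Variable here are hypothetical stand-ins for the Ice classes, and canFold mirrors the logic above:

    // A minimal sketch of the fold rule with stand-in types (Operand and
    // Variable here are placeholders, not the real Ice classes).
    #include <cassert>

    struct Operand { virtual ~Operand() = default; };
    struct Variable : Operand {};

    bool canFold(Operand *LoadSrc, Variable *LoadDest, Operand *&Src0,
                 Operand *&Src1) {
      // Fold only when exactly one src operand is the load's dest.
      if (Src0 == LoadDest && Src1 != LoadDest) {
        Src0 = LoadSrc;
        return true;
      }
      if (Src0 != LoadDest && Src1 == LoadDest) {
        Src1 = LoadSrc;
        return true;
      }
      return false;
    }

    int main() {
      Variable A, B; // A: load dest; B: unrelated operand
      Operand Mem;   // memory operand produced by the load
      Operand *Src0 = &B, *Src1 = &A;
      assert(canFold(&Mem, &A, Src0, Src1) && Src1 == &Mem); // one match: folds
      Src0 = &A;
      Src1 = &A;
      assert(!canFold(&Mem, &A, Src0, Src1)); // both match: refuses to fold
      return 0;
    }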
+
+void TargetX8632::doLoadOpt() {
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr;
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = Context.getCur();
+      Inst *Next = Context.getNextInst();
+      // Determine whether the current instruction is a Load
+      // instruction or equivalent.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad always qualifies.
+        LoadDest = Load->getDest();
+        const bool DoLegalize = false;
+        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
+                                    LoadDest->getType(), DoLegalize);
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid
+        // memory ordering, and can be implemented in a single
+        // instruction (i.e., not i64).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
+        if (ID == Intrinsics::AtomicLoad &&
+            Intrin->getDest()->getType() != IceType_i64 &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+          LoadDest = Intrin->getDest();
+          const bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following
+      // instruction only if the following instruction ends the Load's
+      // Dest variable's live range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr;
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand();
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // The load dest can always be folded into a Cast
+          // instruction.
+          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted();
+          Next->setDeleted();
+          Context.insert(NewInst);
+          // Update NewInst->LiveRangesEnded so that target lowering
+          // may benefit.  Also update NewInst->HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst);
+        }
+      }
+      Context.advanceCur();
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
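
Taken together, this is the separate pass promised by the TODO(stichnot) comment removed from lowerLoad below: a = load [mem]; c = b + a becomes c = b + [mem] whenever the next instruction ends a's live range, and the same fold now applies to icmp, fcmp, select, and cast, not just arithmetic. The AtomicLoad case is restricted to types narrower than i64 because a 64-bit atomic load cannot be implemented in a single x86-32 instruction, so it cannot be represented by one memory operand.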
+
 bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
   if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
     return Br->optimizeBranch(NextNode);
@@ -804,15 +925,15 @@ void TargetX8632::addProlog(CfgNode *Node) {
   // that stack slot.
   std::function<bool(Variable *)> TargetVarHook =
       [&VariablesLinkedToSpillSlots](Variable *Var) {
-    if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
-      assert(Var->getWeight().isZero());
-      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
-        VariablesLinkedToSpillSlots.push_back(Var);
-        return true;
-      }
-    }
-    return false;
-  };
+        if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
+          assert(Var->getWeight().isZero());
+          if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
+            VariablesLinkedToSpillSlots.push_back(Var);
+            return true;
+          }
+        }
+        return false;
+      };
 
   // Compute the list of spilled variables and bounds for GlobalsSize, etc.
   getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
@@ -1170,6 +1291,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
   Operand *Src0 = legalize(Inst->getSrc(0));
   Operand *Src1 = legalize(Inst->getSrc(1));
+  if (Inst->isCommutative()) {
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+      std::swap(Src0, Src1);
+  }
   if (Dest->getType() == IceType_i64) {
     Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
     Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
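
This swap implements the commutativity handling that the old lowerLoad TODO ("Deal with commutativity optimization in the arithmetic instruction lowering") deferred to this function: if doLoadOpt left the folded memory operand (or a constant) in Src0 of a commutative operation while Src1 is a Variable, the operands are exchanged so that the two-operand x86 form (op reg, mem/imm) applies directly. A toy, runnable sketch of the canonicalization, using hypothetical stand-ins for the Ice operand classes:

    // Toy sketch of the commutative-operand canonicalization (stand-in types;
    // the real code uses llvm::isa<Variable> on Ice operands).
    #include <cassert>
    #include <utility>

    struct Operand { virtual ~Operand() = default; };
    struct Variable : Operand {};

    int main() {
      Variable V;
      Operand Mem; // e.g. the memory operand folded in by doLoadOpt
      Operand *Src0 = &Mem, *Src1 = &V;
      // Commutative op: prefer the Variable on the left so the non-Variable
      // lands in Src1, where an x86 binary op can take a memory or immediate.
      if (dynamic_cast<Variable *>(Src0) == nullptr &&
          dynamic_cast<Variable *>(Src1) != nullptr)
        std::swap(Src0, Src1);
      assert(Src0 == &V && Src1 == &Mem);
      return 0;
    }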
@@ -2891,18 +3016,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
   }
 }
 
-namespace {
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-
-} // end of anonymous namespace
-
 void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
   case Intrinsics::AtomicCmpxchg: {
@@ -3006,10 +3119,11 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
       Func->setError("Unexpected memory ordering for AtomicRMW");
       return;
     }
-    lowerAtomicRMW(Instr->getDest(),
-                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
-                                             Instr->getArg(0))->getValue()),
-                   Instr->getArg(1), Instr->getArg(2));
+    lowerAtomicRMW(
+        Instr->getDest(),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+        Instr->getArg(1), Instr->getArg(2));
     return;
   case Intrinsics::AtomicStore: {
     if (!Intrinsics::isMemoryOrderValid(
@@ -3852,66 +3966,9 @@ void TargetX8632::lowerLoad(const InstLoad *Load) {
   // OperandX8632Mem operand.  Note that the address mode
   // optimization already creates an OperandX8632Mem operand, so it
   // doesn't need another level of transformation.
-  Type Ty = Load->getDest()->getType();
-  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-
-  // Fuse this load with a subsequent Arithmetic instruction in the
-  // following situations:
-  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
-  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
-  //
-  // Fuse this load with a subsequent Cast instruction:
-  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
-  //
-  // TODO: Clean up and test thoroughly.
-  // (E.g., if there is an mfence-all make sure the load ends up on the
-  // same side of the fence).
-  //
-  // TODO: Why limit to Arithmetic instructions?  This could probably be
-  // applied to most any instruction type.  Look at all source operands
-  // in the following instruction, and if there is one instance of the
-  // load instruction's dest variable, and that instruction ends that
-  // variable's live range, then make the substitution.  Deal with
-  // commutativity optimization in the arithmetic instruction lowering.
-  //
-  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
-  // the bool folding pass.  Modify Ice::Inst to allow src operands to
-  // be replaced, including updating Inst::LiveRangesEnded, to avoid
-  // having to manually mostly clone each instruction type.
-  Inst *NextInst = Context.getNextInst();
   Variable *DestLoad = Load->getDest();
-  if (NextInst && NextInst->isLastUse(DestLoad)) {
-    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
-      InstArithmetic *NewArith = nullptr;
-      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
-      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-                 DestLoad != Src1Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
-      }
-      if (NewArith) {
-        Arith->setDeleted();
-        Context.advanceNext();
-        lowerArithmetic(NewArith);
-        return;
-      }
-    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
-      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-      if (Src0Cast == DestLoad) {
-        InstCast *NewCast =
-            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
-        Cast->setDeleted();
-        Context.advanceNext();
-        lowerCast(NewCast);
-        return;
-      }
-    }
-  }
-
+  Type Ty = DestLoad->getType();
+  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
   InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
   lowerAssign(Assign);
 }
@@ -4639,7 +4696,8 @@ Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
   return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
 }
 
-OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
+OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
+                                                bool DoLegalize) {
   OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
   // It may be the case that address mode optimization already creates
   // an OperandX8632Mem, so in that case it wouldn't need another level
@@ -4656,7 +4714,7 @@ OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
     }
     Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
   }
-  return llvm::cast<OperandX8632Mem>(legalize(Mem));
+  return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
 }
 
 Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
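
Since the existing two-argument calls to formMemoryOperand (e.g. in lowerLoad above) compile unchanged, the declaration in the header presumably supplies a default of DoLegalize = true; doLoadOpt passes false explicitly because it runs before lowering, where calling legalize() would be premature.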