Index: src/IceTargetLoweringARM32.cpp |
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp |
index f23609b7837cce25402e4d1382ffa9741273d02c..7eb4e8ad446f540ef10a4f12aa9d8c6cb4a29e33 100644 |
--- a/src/IceTargetLoweringARM32.cpp |
+++ b/src/IceTargetLoweringARM32.cpp |
@@ -32,6 +32,7 @@ |
#include "llvm/Support/MathExtras.h" |
#include <algorithm> |
+#include <array> |
#include <utility> |
namespace Ice { |
@@ -2036,6 +2037,154 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op, |
} |
} |
+namespace { |
+// StrengthReduction is a namespace with the strength reduction machinery. The |
+// entry point is the StrengthReduction::tryToOptimize method. It returns true |
+// if the optimization can be performed, and false otherwise. |
+// |
+// If the optimization can be performed, tryToOptimize sets its NumOperations |
+// parameter to the number of shifts that are needed to perform the |
+// multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub> |
+// tuples that describe how to materialize the multiplication. |
+// |
+// The algorithm finds contiguous 1s in the Multiplication source, and uses one |
+// or two shifts to materialize it. A sequence of 1s, e.g., |
+// |
+// M N |
+// ...00000000000011111...111110000000... |
+// |
+// is materializable with (1 << (M + 1)) - (1 << N): |
+// |
+// ...00000000000100000...000000000000... [1 << (M + 1)] |
+// ...00000000000000000...000010000000... (-) [1 << N] |
+// -------------------------------------- |
+// ...00000000000011111...111110000000... |
+// |
+// And a single bit set, which is just a left shift. |
+namespace StrengthReduction { |
+enum AggregationOperation { |
+ AO_Invalid, |
+ AO_Add, |
+ AO_Sub, |
+}; |
+ |
+// AggregateElement is a glorified <ShAmt, AddOrSub> tuple. |
+class AggregationElement { |
+ AggregationElement(const AggregationElement &) = delete; |
+ |
+public: |
+ AggregationElement() = default; |
+ AggregationElement &operator=(const AggregationElement &) = default; |
+ AggregationElement(AggregationOperation Op, uint32_t ShAmt) |
+ : Op(Op), ShAmt(ShAmt) {} |
+ |
+ Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const { |
+ assert(OpR->mustHaveReg()); |
+ if (ShAmt == 0) { |
+ return OpR; |
+ } |
+ return OperandARM32FlexReg::create( |
+ Func, IceType_i32, OpR, OperandARM32::LSL, |
+ OperandARM32ShAmtImm::create( |
+ Func, llvm::cast<ConstantInteger32>( |
+ Func->getContext()->getConstantInt32(ShAmt)))); |
+ } |
+ |
+ bool aggregateWithAdd() const { |
+ switch (Op) { |
+ case AO_Invalid: |
+ llvm::report_fatal_error("Invalid Strength Reduction Operations."); |
+ case AO_Add: |
+ return true; |
+ case AO_Sub: |
+ return false; |
+ } |
+ } |
+ |
+ uint32_t shAmt() const { return ShAmt; } |
+ |
+private: |
+ AggregationOperation Op = AO_Invalid; |
+ uint32_t ShAmt; |
+}; |
+ |
+// [RangeStart, RangeEnd] is a range of 1s in Src. |
+template <std::size_t N> |
+bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations, |
+ std::array<AggregationElement, N> *Operations) { |
+ assert(*NumOperations < N); |
+ if (RangeStart == RangeEnd) { |
+ // Single bit set: |
+ // Src : 0...00010... |
+ // RangeStart : ^ |
+ // RangeEnd : ^ |
+ // NegSrc : 0...00001... |
+ (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart); |
+ ++(*NumOperations); |
+ return true; |
+ } |
+ |
+ // Sequence of 1s: (two operations required.) |
+ // Src : 0...00011...110... |
+ // RangeStart : ^ |
+ // RangeEnd : ^ |
+ // NegSrc : 0...00000...001... |
+ if (*NumOperations + 1 >= N) { |
+ return false; |
+ } |
+ (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1); |
+ ++(*NumOperations); |
+ (*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd); |
+ ++(*NumOperations); |
+ return true; |
+} |
+ |
+// tryToOptmize scans Src looking for sequences of 1s (including the unitary bit |
+// 1 surrounded by zeroes. |
+template <std::size_t N> |
+bool tryToOptimize(uint32_t Src, SizeT *NumOperations, |
+ std::array<AggregationElement, N> *Operations) { |
+ constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT; |
+ uint32_t NegSrc = ~Src; |
+ |
+ *NumOperations = 0; |
+ while (Src != 0 && *NumOperations < N) { |
+ // Each step of the algorithm: |
+ // * finds L, the last bit set in Src; |
+ // * clears all the upper bits in NegSrc up to bit L; |
+ // * finds nL, the last bit set in NegSrc; |
+ // * clears all the upper bits in Src up to bit nL; |
+ // |
+ // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence |
+ // of 1s starting at L, and ending at nL + 1, was found. |
+ const uint32_t SrcLastBitSet = llvm::findLastSet(Src); |
+ const uint32_t NegSrcClearMask = |
+ (SrcLastBitSet == 0) ? 0 |
+ : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet); |
+ NegSrc &= NegSrcClearMask; |
+ if (NegSrc == 0) { |
+ if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) { |
+ return true; |
+ } |
+ return false; |
+ } |
+ const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc); |
+ assert(NegSrcLastBitSet < SrcLastBitSet); |
+ const uint32_t SrcClearMask = |
+ (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >> |
+ (SrcSizeBits - NegSrcLastBitSet); |
+ Src &= SrcClearMask; |
+ if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations, |
+ Operations)) { |
+ return false; |
+ } |
+ } |
+ |
+ return Src == 0; |
+} |
+} // end of namespace StrengthReduction |
+} // end of anonymous namespace |
+ |
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
Variable *Dest = Inst->getDest(); |
@@ -2044,29 +2193,30 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
return; |
} |
- if (Dest->getType() == IceType_i1) { |
+ Type DestTy = Dest->getType(); |
+ if (DestTy == IceType_i1) { |
lowerInt1Arithmetic(Inst); |
return; |
} |
Operand *Src0 = legalizeUndef(Inst->getSrc(0)); |
Operand *Src1 = legalizeUndef(Inst->getSrc(1)); |
- if (Dest->getType() == IceType_i64) { |
+ if (DestTy == IceType_i64) { |
lowerInt64Arithmetic(Inst->getOp(), Inst->getDest(), Src0, Src1); |
return; |
} |
- if (isVectorType(Dest->getType())) { |
+ if (isVectorType(DestTy)) { |
// Add a fake def to keep liveness consistent in the meantime. |
- Variable *T = makeReg(Dest->getType()); |
+ Variable *T = makeReg(DestTy); |
Context.insert(InstFakeDef::create(Func, T)); |
_mov(Dest, T); |
UnimplementedError(Func->getContext()->getFlags()); |
return; |
} |
- // Dest->getType() is a non-i64 scalar. |
- Variable *T = makeReg(Dest->getType()); |
+ // DestTy is a non-i64 scalar. |
+ Variable *T = makeReg(DestTy); |
// * Handle div/rem separately. They require a non-legalized Src1 to inspect |
// whether or not Src1 is a non-zero constant. Once legalized it is more |
@@ -2107,7 +2257,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
case InstArithmetic::Frem: { |
constexpr SizeT MaxSrcs = 2; |
Variable *Src0R = legalizeToReg(Src0); |
- Type Ty = Dest->getType(); |
+ Type Ty = DestTy; |
InstCall *Call = makeHelperCall( |
isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs); |
Call->addArg(Src0R); |
@@ -2205,8 +2355,6 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
} |
case InstArithmetic::Sub: { |
if (Srcs.hasConstOperand()) { |
- // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed |
- // to be used. |
if (Srcs.immediateIsFlexEncodable()) { |
Variable *Src0R = Srcs.src0R(this); |
Operand *Src1RF = Srcs.src1RF(this); |
@@ -2233,6 +2381,85 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
return; |
} |
case InstArithmetic::Mul: { |
+ const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1; |
+ if (!OptM1 && Srcs.hasConstOperand()) { |
+ constexpr std::size_t MaxShifts = 4; |
+ std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts; |
+ SizeT NumOperations; |
+ int32_t Const = Srcs.getConstantValue(); |
+ const bool Invert = Const < 0; |
+ const bool MultiplyByZero = Const == 0; |
+ Operand *_0 = |
+ legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex); |
+ |
+ if (MultiplyByZero) { |
+ _mov(T, _0); |
+ _mov(Dest, T); |
+ return; |
+ } |
+ |
+ if (Invert) { |
+ Const = -Const; |
+ } |
+ |
+ if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) { |
+ assert(NumOperations >= 1); |
+ Variable *Src0R = Srcs.src0R(this); |
+ int32_t Start; |
+ int32_t End; |
+ if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) { |
+ // Multiplication by a power of 2 (NumOperations == 1); or |
+ // Multiplication by a even number not a power of 2. |
+ Start = 1; |
+ End = NumOperations; |
+ assert(Shifts[0].aggregateWithAdd()); |
+ _lsl(T, Src0R, shAmtImm(Shifts[0].shAmt())); |
+ } else { |
+ // Multiplication by an odd number. Put the free barrel shifter to a |
+ // good use. |
+ Start = 0; |
+ End = NumOperations - 2; |
+ const StrengthReduction::AggregationElement &Last = |
+ Shifts[NumOperations - 1]; |
+ const StrengthReduction::AggregationElement &SecondToLast = |
+ Shifts[NumOperations - 2]; |
+ if (!Last.aggregateWithAdd()) { |
+ assert(SecondToLast.aggregateWithAdd()); |
+ _rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); |
+ } else if (!SecondToLast.aggregateWithAdd()) { |
+ assert(Last.aggregateWithAdd()); |
+ _sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); |
+ } else { |
+ _add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); |
+ } |
+ } |
+ |
+ // Odd numbers : S E I I |
+ // +---+---+---+---+---+---+ ... +---+---+---+---+ |
+ // Shifts = | | | | | | | ... | | | | | |
+ // +---+---+---+---+---+---+ ... +---+---+---+---+ |
+ // Even numbers: I S E |
+ // |
+ // S: Start; E: End; I: Init |
+ for (int32_t I = Start; I < End; ++I) { |
+ const StrengthReduction::AggregationElement &Current = Shifts[I]; |
+ Operand *SrcF = Current.createShiftedOperand(Func, Src0R); |
+ if (Current.aggregateWithAdd()) { |
+ _add(T, T, SrcF); |
+ } else { |
+ _sub(T, T, SrcF); |
+ } |
+ } |
+ |
+ if (Invert) { |
+ // T = 0 - T. |
+ _rsb(T, T, _0); |
+ } |
+ |
+ _mov(Dest, T); |
+ return; |
+ } |
+ } |
Variable *Src0R = Srcs.unswappedSrc0R(this); |
Variable *Src1R = Srcs.unswappedSrc1R(this); |
_mul(T, Src0R, Src1R); |
@@ -2248,7 +2475,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
} |
case InstArithmetic::Lshr: { |
Variable *Src0R = Srcs.unswappedSrc0R(this); |
- if (Dest->getType() != IceType_i32) { |
+ if (DestTy != IceType_i32) { |
_uxt(Src0R, Src0R); |
} |
_lsr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this)); |
@@ -2257,7 +2484,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) { |
} |
case InstArithmetic::Ashr: { |
Variable *Src0R = Srcs.unswappedSrc0R(this); |
- if (Dest->getType() != IceType_i32) { |
+ if (DestTy != IceType_i32) { |
_sxt(Src0R, Src0R); |
} |
_asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this)); |