src/IceTargetLoweringARM32.cpp - Issue 1469113003: Subzero. ARM32. Strength reduce multiplications.

Unified Diff: src/IceTargetLoweringARM32.cpp

Issue 1469113003: Subzero. ARM32. Strength reduce multiplications. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: Addresses comments. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/IceTargetLoweringARM32.cpp

diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp

index f23609b7837cce25402e4d1382ffa9741273d02c..7eb4e8ad446f540ef10a4f12aa9d8c6cb4a29e33 100644

--- a/src/IceTargetLoweringARM32.cpp

+++ b/src/IceTargetLoweringARM32.cpp

@@ -32,6 +32,7 @@

#include "llvm/Support/MathExtras.h"

#include <algorithm>

+#include <array>

#include <utility>

namespace Ice {

@@ -2036,6 +2037,154 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,

}

+namespace {

+// StrengthReduction is a namespace with the strength reduction machinery. The

+// entry point is the StrengthReduction::tryToOptimize method. It returns true

+// if the optimization can be performed, and false otherwise.

+//

+// If the optimization can be performed, tryToOptimize sets its NumOperations

+// parameter to the number of shifts that are needed to perform the

+// multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>

+// tuples that describe how to materialize the multiplication.

+//

+// The algorithm finds contiguous 1s in the Multiplication source, and uses one

+// or two shifts to materialize it. A sequence of 1s, e.g.,

+//

+// M N

+// ...00000000000011111...111110000000...

+//

+// is materializable with (1 << (M + 1)) - (1 << N):

+//

+// ...00000000000100000...000000000000... [1 << (M + 1)]

+// ...00000000000000000...000010000000... (-) [1 << N]

+// --------------------------------------

+// ...00000000000011111...111110000000...

+//

+// And a single bit set, which is just a left shift.

+namespace StrengthReduction {

+enum AggregationOperation {

+ AO_Invalid,

+ AO_Add,

+ AO_Sub,

+};

+// AggregateElement is a glorified <ShAmt, AddOrSub> tuple.

+class AggregationElement {

+ AggregationElement(const AggregationElement &) = delete;

+public:

+ AggregationElement() = default;

+ AggregationElement &operator=(const AggregationElement &) = default;

+ AggregationElement(AggregationOperation Op, uint32_t ShAmt)

+ : Op(Op), ShAmt(ShAmt) {}

+ Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const {

+ assert(OpR->mustHaveReg());

+ if (ShAmt == 0) {

+ return OpR;

+ }

+ return OperandARM32FlexReg::create(

+ Func, IceType_i32, OpR, OperandARM32::LSL,

+ OperandARM32ShAmtImm::create(

+ Func, llvm::cast<ConstantInteger32>(

+ Func->getContext()->getConstantInt32(ShAmt))));

+ }

+ bool aggregateWithAdd() const {

+ switch (Op) {

+ case AO_Invalid:

+ llvm::report_fatal_error("Invalid Strength Reduction Operations.");

+ case AO_Add:

+ return true;

+ case AO_Sub:

+ return false;

+ }

+ uint32_t shAmt() const { return ShAmt; }

+private:

+ AggregationOperation Op = AO_Invalid;

+ uint32_t ShAmt;

+};

+// [RangeStart, RangeEnd] is a range of 1s in Src.

+template <std::size_t N>

+bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,

+ std::array<AggregationElement, N> *Operations) {

+ assert(*NumOperations < N);

+ if (RangeStart == RangeEnd) {

+ // Single bit set:

+ // Src : 0...00010...

+ // RangeStart : ^

+ // RangeEnd : ^

+ // NegSrc : 0...00001...

+ (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart);

+ ++(*NumOperations);

+ return true;

+ }

+ // Sequence of 1s: (two operations required.)

+ // Src : 0...00011...110...

+ // RangeStart : ^

+ // RangeEnd : ^

+ // NegSrc : 0...00000...001...

+ if (*NumOperations + 1 >= N) {

+ return false;

+ }

+ (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1);

+ ++(*NumOperations);

+ (*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd);

+ ++(*NumOperations);

+ return true;

+// tryToOptmize scans Src looking for sequences of 1s (including the unitary bit

+// 1 surrounded by zeroes.

+template <std::size_t N>

+bool tryToOptimize(uint32_t Src, SizeT *NumOperations,

+ std::array<AggregationElement, N> *Operations) {

+ constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;

+ uint32_t NegSrc = ~Src;

+ *NumOperations = 0;

+ while (Src != 0 && *NumOperations < N) {

+ // Each step of the algorithm:

+ // * finds L, the last bit set in Src;

+ // * clears all the upper bits in NegSrc up to bit L;

+ // * finds nL, the last bit set in NegSrc;

+ // * clears all the upper bits in Src up to bit nL;

+ //

+ // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence

+ // of 1s starting at L, and ending at nL + 1, was found.

+ const uint32_t SrcLastBitSet = llvm::findLastSet(Src);

+ const uint32_t NegSrcClearMask =

+ (SrcLastBitSet == 0) ? 0

+ : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);

+ NegSrc &= NegSrcClearMask;

+ if (NegSrc == 0) {

+ if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) {

+ return true;

+ }

+ return false;

+ }

+ const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);

+ assert(NegSrcLastBitSet < SrcLastBitSet);

+ const uint32_t SrcClearMask =

+ (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >>

+ (SrcSizeBits - NegSrcLastBitSet);

+ Src &= SrcClearMask;

+ if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,

+ Operations)) {

+ return false;

+ }

+ return Src == 0;

+} // end of namespace StrengthReduction

+} // end of anonymous namespace

void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

Variable *Dest = Inst->getDest();

@@ -2044,29 +2193,30 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

return;

}

- if (Dest->getType() == IceType_i1) {

+ Type DestTy = Dest->getType();

+ if (DestTy == IceType_i1) {

lowerInt1Arithmetic(Inst);

return;

}

Operand *Src0 = legalizeUndef(Inst->getSrc(0));

Operand *Src1 = legalizeUndef(Inst->getSrc(1));

- if (Dest->getType() == IceType_i64) {

+ if (DestTy == IceType_i64) {

lowerInt64Arithmetic(Inst->getOp(), Inst->getDest(), Src0, Src1);

return;

}

- if (isVectorType(Dest->getType())) {

+ if (isVectorType(DestTy)) {

// Add a fake def to keep liveness consistent in the meantime.

- Variable *T = makeReg(Dest->getType());

+ Variable *T = makeReg(DestTy);

Context.insert(InstFakeDef::create(Func, T));

_mov(Dest, T);

UnimplementedError(Func->getContext()->getFlags());

return;

}

- // Dest->getType() is a non-i64 scalar.

- Variable *T = makeReg(Dest->getType());

+ // DestTy is a non-i64 scalar.

+ Variable *T = makeReg(DestTy);

// * Handle div/rem separately. They require a non-legalized Src1 to inspect

// whether or not Src1 is a non-zero constant. Once legalized it is more

@@ -2107,7 +2257,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

case InstArithmetic::Frem: {

constexpr SizeT MaxSrcs = 2;

Variable *Src0R = legalizeToReg(Src0);

- Type Ty = Dest->getType();

+ Type Ty = DestTy;

InstCall *Call = makeHelperCall(

isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);

Call->addArg(Src0R);

@@ -2205,8 +2355,6 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

}

case InstArithmetic::Sub: {

if (Srcs.hasConstOperand()) {

- // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed

- // to be used.

if (Srcs.immediateIsFlexEncodable()) {

Variable *Src0R = Srcs.src0R(this);

Operand *Src1RF = Srcs.src1RF(this);

@@ -2233,6 +2381,85 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

return;

}

case InstArithmetic::Mul: {

+ const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;

+ if (!OptM1 && Srcs.hasConstOperand()) {

+ constexpr std::size_t MaxShifts = 4;

+ std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts;

+ SizeT NumOperations;

+ int32_t Const = Srcs.getConstantValue();

+ const bool Invert = Const < 0;

+ const bool MultiplyByZero = Const == 0;

+ Operand *_0 =

+ legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex);

+ if (MultiplyByZero) {

+ _mov(T, _0);

+ _mov(Dest, T);

+ return;

+ }

+ if (Invert) {

+ Const = -Const;

+ }

+ if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) {

+ assert(NumOperations >= 1);

+ Variable *Src0R = Srcs.src0R(this);

+ int32_t Start;

+ int32_t End;

+ if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) {

+ // Multiplication by a power of 2 (NumOperations == 1); or

+ // Multiplication by a even number not a power of 2.

+ Start = 1;

+ End = NumOperations;

+ assert(Shifts[0].aggregateWithAdd());

+ _lsl(T, Src0R, shAmtImm(Shifts[0].shAmt()));

+ } else {

+ // Multiplication by an odd number. Put the free barrel shifter to a

+ // good use.

+ Start = 0;

+ End = NumOperations - 2;

+ const StrengthReduction::AggregationElement &Last =

+ Shifts[NumOperations - 1];

+ const StrengthReduction::AggregationElement &SecondToLast =

+ Shifts[NumOperations - 2];

+ if (!Last.aggregateWithAdd()) {

+ assert(SecondToLast.aggregateWithAdd());

+ _rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

+ } else if (!SecondToLast.aggregateWithAdd()) {

+ assert(Last.aggregateWithAdd());

+ _sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

+ } else {

+ _add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

+ }

+ // Odd numbers : S E I I

+ // +---+---+---+---+---+---+ ... +---+---+---+---+

+ // Shifts = | | | | | | | ... | | | | |

+ // +---+---+---+---+---+---+ ... +---+---+---+---+

+ // Even numbers: I S E

+ //

+ // S: Start; E: End; I: Init

+ for (int32_t I = Start; I < End; ++I) {

+ const StrengthReduction::AggregationElement &Current = Shifts[I];

+ Operand *SrcF = Current.createShiftedOperand(Func, Src0R);

+ if (Current.aggregateWithAdd()) {

+ _add(T, T, SrcF);

+ } else {

+ _sub(T, T, SrcF);

+ }

+ if (Invert) {

+ // T = 0 - T.

+ _rsb(T, T, _0);

+ }

+ _mov(Dest, T);

+ return;

+ }

Variable *Src0R = Srcs.unswappedSrc0R(this);

Variable *Src1R = Srcs.unswappedSrc1R(this);

_mul(T, Src0R, Src1R);

@@ -2248,7 +2475,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

}

case InstArithmetic::Lshr: {

Variable *Src0R = Srcs.unswappedSrc0R(this);

- if (Dest->getType() != IceType_i32) {

+ if (DestTy != IceType_i32) {

_uxt(Src0R, Src0R);

}

_lsr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));

@@ -2257,7 +2484,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

}

case InstArithmetic::Ashr: {

Variable *Src0R = Srcs.unswappedSrc0R(this);

- if (Dest->getType() != IceType_i32) {

+ if (DestTy != IceType_i32) {

_sxt(Src0R, Src0R);

}

_asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));

« no previous file with comments | « src/IceTargetLoweringARM32.h ('k') | tests_lit/llvm2ice_tests/arith.ll » ('j') | no next file with comments »