src/IceTargetLoweringARM32.cpp - Issue 1469113003: Subzero. ARM32. Strength reduce multiplications.

Side by Side Diff: src/IceTargetLoweringARM32.cpp

Issue 1469113003: Subzero. ARM32. Strength reduce multiplications. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: Modified arith.ll to test the strength reduction output. Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 //===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//	1 //===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//

2 //	2 //

3 // The Subzero Code Generator	3 // The Subzero Code Generator

4 //	4 //

5 // This file is distributed under the University of Illinois Open Source	5 // This file is distributed under the University of Illinois Open Source

6 // License. See LICENSE.TXT for details.	6 // License. See LICENSE.TXT for details.

7 //	7 //

8 //===----------------------------------------------------------------------===//	8 //===----------------------------------------------------------------------===//

9 ///	9 ///

10 /// \file	10 /// \file

(...skipping 14 matching lines...) Expand all Loading...
25 #include "IceInstVarIter.h"	25 #include "IceInstVarIter.h"

26 #include "IceLiveness.h"	26 #include "IceLiveness.h"

27 #include "IceOperand.h"	27 #include "IceOperand.h"

28 #include "IcePhiLoweringImpl.h"	28 #include "IcePhiLoweringImpl.h"

29 #include "IceRegistersARM32.h"	29 #include "IceRegistersARM32.h"

30 #include "IceTargetLoweringARM32.def"	30 #include "IceTargetLoweringARM32.def"

31 #include "IceUtils.h"	31 #include "IceUtils.h"

32 #include "llvm/Support/MathExtras.h"	32 #include "llvm/Support/MathExtras.h"

33	33

34 #include <algorithm>	34 #include <algorithm>

	35 #include <array>

35 #include <utility>	36 #include <utility>

36	37

37 namespace Ice {	38 namespace Ice {

38	39

39 namespace {	40 namespace {

40	41

41 // The following table summarizes the logic for lowering the icmp instruction	42 // The following table summarizes the logic for lowering the icmp instruction

42 // for i32 and narrower types. Each icmp condition has a clear mapping to an	43 // for i32 and narrower types. Each icmp condition has a clear mapping to an

43 // ARM32 conditional move instruction.	44 // ARM32 conditional move instruction.

44	45

(...skipping 1984 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2029 case InstArithmetic::Udiv:	2030 case InstArithmetic::Udiv:

2030 case InstArithmetic::Sdiv:	2031 case InstArithmetic::Sdiv:

2031 case InstArithmetic::Urem:	2032 case InstArithmetic::Urem:

2032 case InstArithmetic::Srem:	2033 case InstArithmetic::Srem:

2033 llvm::report_fatal_error("Call-helper-involved instruction for i64 type "	2034 llvm::report_fatal_error("Call-helper-involved instruction for i64 type "

2034 "should have already been handled before");	2035 "should have already been handled before");

2035 return;	2036 return;

2036 }	2037 }

2037 }	2038 }

2038	2039

	2040 namespace {

	2041 // StrengthReduction is a namespace with the strength reduction machinery. The

	2042 // entry point is the StrengthReduction::tryToOptimize method. It returns true

	2043 // if the optimization can be performed, and false otherwise.

	2044 //

	2045 // If the optimization can be performed, tryToOptimize sets its NumOperations

	2046 // parameter to the number of shifts that are needed to perform the

	2047 // multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>

	2048 // tuples that describe how to materialize the multiplication.

	2049 //

	2050 // The algorithm finds contiguous 1s in the Multiplication source, and uses one

	2051 // or two shifts to materialize it. A sequence of 1s, i.e.
	Jim Stichnoth 2015/11/24 19:32:20 e.g. instead of i.e., right? e.g. instead of i.e., right? John 2015/11/24 20:26:55 Done. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > e.g. instead of i.e., right? Done.
	2052 //

	2053 // M N

	2054 // ...00000000000011111...111110000000...

	2055 //

	2056 // is materializable with (1 << (M + 1)) - (1 << N):

	2057 //

	2058 // ...00000000000100000...000000000000... [1 << (M + 1)]

	2059 // ...00000000000000000...000010000000... (-) [1 << N]

	2060 // --------------------------------------

	2061 // ...00000000000011111...111110000000...

	2062 //

	2063 // And a single bit set, i.e., is just a left shift.

	2064 namespace StrengthReduction {

	2065 enum AggregationOperation {

	2066 AO_Invalid,

	2067 AO_Add,

	2068 AO_Sub,

	2069 };

	2070

	2071 // AggregateElement is a glorified <ShAmt, AddOrSub> tuple.

	2072 class AggregationElement {

	2073 AggregationElement(const AggregationElement &) = delete;

	2074

	2075 public:

	2076 AggregationElement() = default;

	2077 AggregationElement &operator=(const AggregationElement &) = default;

	2078 AggregationElement(AggregationOperation Op, uint32_t ShAmt)

	2079 : Op(Op), ShAmt(ShAmt) {}

	2080

	2081 Operand createShiftedOperand(Cfg Func, Variable *OpR) const {

	2082 assert(OpR->mustHaveReg());

	2083 if (ShAmt == 0) {

	2084 return OpR;

	2085 }

	2086 return OperandARM32FlexReg::create(

	2087 Func, IceType_i32, OpR, OperandARM32::LSL,

	2088 OperandARM32ShAmtImm::create(

	2089 Func, llvm::cast<ConstantInteger32>(

	2090 Func->getContext()->getConstantInt32(ShAmt))));

	2091 }

	2092

	2093 bool aggregateWithAdd() const {

	2094 switch (Op) {

	2095 case AO_Invalid:

	2096 llvm::report_fatal_error("Invalid Strength Reduction Operations.");

	2097 case AO_Add:

	2098 return true;

	2099 case AO_Sub:

	2100 return false;

	2101 }

	2102 }

	2103

	2104 uint32_t shAmt() const { return ShAmt; }

	2105

	2106 private:

	2107 AggregationOperation Op = AO_Invalid;

	2108 uint32_t ShAmt = 33;
	Jim Stichnoth 2015/11/24 19:32:20 I assumed 33 is meant to be an invalid value when I assumed 33 is meant to be an invalid value when using the default zero-arg ctor? Can you add a comment like "invalid shift amount"? John 2015/11/24 20:26:55 This used to matter when this element was a const. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > I assumed 33 is meant to be an invalid value when using the default zero-arg > ctor? Can you add a comment like "invalid shift amount"? This used to matter when this element was a const. removed.
	2109 };

	2110

	2111 template <std::size_t N>

	2112 bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,
	Jim Stichnoth 2015/11/24 19:32:20 I think it might be helpful to document that [Rang I think it might be helpful to document that [RangeStart, RangeEnd] is a closed interval. John 2015/11/24 20:26:55 Done. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > I think it might be helpful to document that [RangeStart, RangeEnd] is a closed > interval. Done.
	2113 std::array<AggregationElement, N> *Operations) {

	2114 assert(*NumOperations < N);

	2115 if (RangeStart == RangeEnd) {

	2116 // Single bit set:

	2117 // Src : 0...00010...

	2118 // RangeStart : ^

	2119 // RangeEnd : ^

	2120 // NegSrc : 0...00001...

	2121 (Operations)[NumOperations] = AggregationElement(AO_Add, RangeStart);

	2122 ++(*NumOperations);

	2123 return true;

	2124 }

	2125

	2126 // Sequence of 1s: (two operations required.)

	2127 // Src : 0...00011...110...

	2128 // RangeStart : ^

	2129 // RangeEnd : ^

	2130 // NegSrc : 0...00000...001...

	2131 if (*NumOperations + 1 >= N) {

	2132 return false;

	2133 }

	2134 (Operations)[NumOperations] = AggregationElement(AO_Add, RangeStart + 1);

	2135 ++(*NumOperations);

	2136 (Operations)[NumOperations] = AggregationElement(AO_Sub, RangeEnd);

	2137 ++(*NumOperations);

	2138 return true;

	2139 }

	2140

	2141 // tryToOptmize scans Src looking for sequences of 1s (including the unitary bit

	2142 // 1 surrounded by zeroes.

	2143 template <std::size_t N>

	2144 bool tryToOptimize(uint32_t Src, SizeT *NumOperations,

	2145 std::array<AggregationElement, N> *Operations) {

	2146 constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;

	2147 uint32_t NegSrc = ~Src;

	2148

	2149 *NumOperations = 0;

	2150 while (Src != 0 && *NumOperations < N) {

	2151 // Each step of the algorithm:

	2152 // * finds L, the last bit set in Src;

	2153 // * clears all the upper bits in NegSrc up bit L;
	Jim Stichnoth 2015/11/24 19:32:20 up to bit up to bit John 2015/11/24 20:26:55 Done. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > up to bit Done.
	2154 // * finds nL, the last bit set in NegSrc;

	2155 // * clears all the upper bits in Src up to bit nL;

	2156 //

	2157 // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence

	2158 // of 1s starting at L, and ending a nL + 1, was found.
	Jim Stichnoth 2015/11/24 19:32:20 ending at ending at John 2015/11/24 20:26:55 Done. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > ending at Done.
	2159 const uint32_t SrcLastBitSet = llvm::findLastSet(Src);

	2160 const uint32_t NegSrcClearMask =

	2161 (SrcLastBitSet == 0) ? 0

	2162 : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);

	2163 NegSrc &= NegSrcClearMask;

	2164 if (NegSrc == 0) {

	2165 if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) {

	2166 return true;

	2167 }

	2168 return false;

	2169 }

	2170 const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);

	2171 assert(NegSrcLastBitSet < SrcLastBitSet);

	2172 const uint32_t SrcClearMask =

	2173 (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >>

	2174 (SrcSizeBits - NegSrcLastBitSet);

	2175 Src &= SrcClearMask;

	2176 if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,

	2177 Operations)) {

	2178 return false;

	2179 }

	2180 }

	2181

	2182 return Src == 0;

	2183 }

	2184 } // end of namespace StrengthReduction

	2185 } // end of anonymous namespace

	2186

2039 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {	2187 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {

2040 Variable *Dest = Inst->getDest();	2188 Variable *Dest = Inst->getDest();

2041	2189

2042 if (Dest->isRematerializable()) {	2190 if (Dest->isRematerializable()) {

2043 Context.insert(InstFakeDef::create(Func, Dest));	2191 Context.insert(InstFakeDef::create(Func, Dest));

2044 return;	2192 return;

2045 }	2193 }

2046	2194

2047 if (Dest->getType() == IceType_i1) {	2195 if (Dest->getType() == IceType_i1) {

2048 lowerInt1Arithmetic(Inst);	2196 lowerInt1Arithmetic(Inst);

(...skipping 149 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2198 }	2346 }

2199 case InstArithmetic::Xor: {	2347 case InstArithmetic::Xor: {

2200 Variable *Src0R = Srcs.src0R(this);	2348 Variable *Src0R = Srcs.src0R(this);

2201 Operand *Src1RF = Srcs.src1RF(this);	2349 Operand *Src1RF = Srcs.src1RF(this);

2202 _eor(T, Src0R, Src1RF);	2350 _eor(T, Src0R, Src1RF);

2203 _mov(Dest, T);	2351 _mov(Dest, T);

2204 return;	2352 return;

2205 }	2353 }

2206 case InstArithmetic::Sub: {	2354 case InstArithmetic::Sub: {

2207 if (Srcs.hasConstOperand()) {	2355 if (Srcs.hasConstOperand()) {

2208 // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed

2209 // to be used.

2210 if (Srcs.immediateIsFlexEncodable()) {	2356 if (Srcs.immediateIsFlexEncodable()) {

2211 Variable *Src0R = Srcs.src0R(this);	2357 Variable *Src0R = Srcs.src0R(this);

2212 Operand *Src1RF = Srcs.src1RF(this);	2358 Operand *Src1RF = Srcs.src1RF(this);

2213 if (Srcs.swappedOperands()) {	2359 if (Srcs.swappedOperands()) {

2214 _rsb(T, Src0R, Src1RF);	2360 _rsb(T, Src0R, Src1RF);

2215 } else {	2361 } else {

2216 _sub(T, Src0R, Src1RF);	2362 _sub(T, Src0R, Src1RF);

2217 }	2363 }

2218 _mov(Dest, T);	2364 _mov(Dest, T);

2219 return;	2365 return;

2220 }	2366 }

2221 if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {	2367 if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {

2222 Variable *Src0R = Srcs.src0R(this);	2368 Variable *Src0R = Srcs.src0R(this);

2223 Operand *Src1F = Srcs.negatedSrc1F(this);	2369 Operand *Src1F = Srcs.negatedSrc1F(this);

2224 _add(T, Src0R, Src1F);	2370 _add(T, Src0R, Src1F);

2225 _mov(Dest, T);	2371 _mov(Dest, T);

2226 return;	2372 return;

2227 }	2373 }

2228 }	2374 }

2229 Variable *Src0R = Srcs.unswappedSrc0R(this);	2375 Variable *Src0R = Srcs.unswappedSrc0R(this);

2230 Variable *Src1R = Srcs.unswappedSrc1R(this);	2376 Variable *Src1R = Srcs.unswappedSrc1R(this);

2231 _sub(T, Src0R, Src1R);	2377 _sub(T, Src0R, Src1R);

2232 _mov(Dest, T);	2378 _mov(Dest, T);

2233 return;	2379 return;

2234 }	2380 }

2235 case InstArithmetic::Mul: {	2381 case InstArithmetic::Mul: {

	2382 const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
	Jim Stichnoth 2015/11/24 19:32:20 In IceTargetLoweringX86BaseImpl.h, strength-reduce In IceTargetLoweringX86BaseImpl.h, strength-reduced multiplication is done for Opt_1 and above (i.e. not Opt_0). This was kind of an arbitrary decision and should be revisited, but it would be good to have a consistent policy across targets. John 2015/11/24 20:26:55 In this case, I ran ammp with 4 and 5 shifts, and Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > In IceTargetLoweringX86BaseImpl.h, strength-reduced multiplication is done for > Opt_1 and above (i.e. not Opt_0). This was kind of an arbitrary decision and > should be revisited, but it would be good to have a consistent policy across > targets. In this case, I ran ammp with 4 and 5 shifts, and 5 was slightly/marginally worse. 4 allows multiplications by constants that have two "ranges" of 1s.
	2383 if (!OptM1 && Srcs.hasConstOperand()) {

	2384 constexpr std::size_t MaxShifts = 4;

	2385 std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts;

	2386 SizeT NumOperations;

	2387 int32_t Const = Srcs.getConstantValue();

	2388 const bool Invert = Const < 0;

	2389 const bool MultiplyByZero = Const == 0;

	2390 Operand *_0 = legalize(Ctx->getConstantInt32(0), Legal_Reg \| Legal_Flex);
	Jim Stichnoth 2015/11/24 19:32:20 Instead of getConstantInt32(0), it may be better t Instead of getConstantInt32(0), it may be better to use getConstantZero(Dest->getType()), in case we care down the road about type consistency in the operands. John 2015/11/24 20:26:55 Done. Show quoted text On 2015/11/24 19:32:20, stichnot wrote: > Instead of getConstantInt32(0), it may be better to use > getConstantZero(Dest->getType()), in case we care down the road about type > consistency in the operands. Done.
	2391

	2392 if (MultiplyByZero) {

	2393 _mov(T, _0);

	2394 _mov(Dest, T);

	2395 return;

	2396 }

	2397

	2398 if (Invert) {

	2399 Const = -Const;

	2400 }

	2401

	2402 if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) {

	2403 assert(NumOperations >= 1);

	2404 Variable *Src0R = Srcs.src0R(this);

	2405 int32_t Start;

	2406 int32_t End;

	2407 if (NumOperations == 1 \|\| Shifts[NumOperations - 1].shAmt() != 0) {

	2408 // Multiplication by a power of 2 (NumOperations == 1); or

	2409 // Multiplication by a even number not a power of 2.

	2410 Start = 1;

	2411 End = NumOperations;

	2412 assert(Shifts[0].aggregateWithAdd());

	2413 _lsl(T, Src0R, shAmtImm(Shifts[0].shAmt()));

	2414 } else {

	2415 // Multiplication by an odd number. Put the free barrel shifter to a

	2416 // good use.

	2417 Start = 0;

	2418 End = NumOperations - 2;

	2419 const StrengthReduction::AggregationElement &Last =

	2420 Shifts[NumOperations - 1];

	2421 const StrengthReduction::AggregationElement &SecondToLast =

	2422 Shifts[NumOperations - 2];

	2423 if (!Last.aggregateWithAdd()) {

	2424 assert(SecondToLast.aggregateWithAdd());

	2425 _rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

	2426 } else if (!SecondToLast.aggregateWithAdd()) {

	2427 assert(Last.aggregateWithAdd());

	2428 _sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

	2429 } else {

	2430 _add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));

	2431 }

	2432 }

	2433

	2434 // Odd numbers : S E I I

	2435 // +---+---+---+---+---+---+ ... +---+---+---+---+

	2436 // Shifts = \| \| \| \| \| \| \| ... \| \| \| \| \|

	2437 // +---+---+---+---+---+---+ ... +---+---+---+---+

	2438 // Even numbers: I S E

	2439 //

	2440 // S: Start; E: End; I: Init

	2441 for (int32_t I = Start; I < End; ++I) {

	2442 const StrengthReduction::AggregationElement &Current = Shifts[I];

	2443 Operand *SrcF = Current.createShiftedOperand(Func, Src0R);

	2444 if (Current.aggregateWithAdd()) {

	2445 _add(T, T, SrcF);

	2446 } else {

	2447 _sub(T, T, SrcF);

	2448 }

	2449 }

	2450

	2451 if (Invert) {

	2452 // T = 0 - T.

	2453 _rsb(T, T, _0);

	2454 }

	2455

	2456 _mov(Dest, T);

	2457 return;

	2458 }

	2459 }

2236 Variable *Src0R = Srcs.unswappedSrc0R(this);	2460 Variable *Src0R = Srcs.unswappedSrc0R(this);

2237 Variable *Src1R = Srcs.unswappedSrc1R(this);	2461 Variable *Src1R = Srcs.unswappedSrc1R(this);

2238 _mul(T, Src0R, Src1R);	2462 _mul(T, Src0R, Src1R);

2239 _mov(Dest, T);	2463 _mov(Dest, T);

2240 return;	2464 return;

2241 }	2465 }

2242 case InstArithmetic::Shl: {	2466 case InstArithmetic::Shl: {

2243 Variable *Src0R = Srcs.unswappedSrc0R(this);	2467 Variable *Src0R = Srcs.unswappedSrc0R(this);

2244 Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this);	2468 Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this);

2245 _lsl(T, Src0R, Src1R);	2469 _lsl(T, Src0R, Src1R);

(...skipping 3257 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5503 // Technically R9 is used for TLS with Sandboxing, and we reserve it.	5727 // Technically R9 is used for TLS with Sandboxing, and we reserve it.

5504 // However, for compatibility with current NaCl LLVM, don't claim that.	5728 // However, for compatibility with current NaCl LLVM, don't claim that.

5505 Str << ".eabi_attribute 14, 3 @ Tag_ABI_PCS_R9_use: Not used\n";	5729 Str << ".eabi_attribute 14, 3 @ Tag_ABI_PCS_R9_use: Not used\n";

5506 }	5730 }

5507	5731

5508 llvm::SmallBitVector TargetARM32::TypeToRegisterSet[IceType_NUM];	5732 llvm::SmallBitVector TargetARM32::TypeToRegisterSet[IceType_NUM];

5509 llvm::SmallBitVector TargetARM32::RegisterAliases[RegARM32::Reg_NUM];	5733 llvm::SmallBitVector TargetARM32::RegisterAliases[RegARM32::Reg_NUM];

5510 llvm::SmallBitVector TargetARM32::ScratchRegs;	5734 llvm::SmallBitVector TargetARM32::ScratchRegs;

5511	5735

5512 } // end of namespace Ice	5736 } // end of namespace Ice

OLD	NEW

« no previous file with comments | « src/IceTargetLoweringARM32.h ('k') | tests_lit/llvm2ice_tests/arith.ll » ('j') | no next file with comments »