OLD | NEW |
---|---|
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
2 // | 2 // |
3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
4 // | 4 // |
5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
7 // | 7 // |
8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
9 // | 9 // |
10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
(...skipping 21 matching lines...) | |
32 // lowerFcmp() describes the lowering template. In the most general case, there | 32 // lowerFcmp() describes the lowering template. In the most general case, there |
33 // is a compare followed by two conditional branches, because some fcmp | 33 // is a compare followed by two conditional branches, because some fcmp |
34 // conditions don't map to a single x86 conditional branch. However, in many | 34 // conditions don't map to a single x86 conditional branch. However, in many |
35 // cases it is possible to swap the operands in the comparison and have a single | 35 // cases it is possible to swap the operands in the comparison and have a single |
36 // conditional branch. Since it's quite tedious to validate the table by hand, | 36 // conditional branch. Since it's quite tedious to validate the table by hand, |
37 // good execution tests are helpful. | 37 // good execution tests are helpful. |
38 | 38 |
39 const struct TableFcmp_ { | 39 const struct TableFcmp_ { |
40 uint32_t Default; | 40 uint32_t Default; |
41 bool SwapOperands; | 41 bool SwapOperands; |
42 InstX8632Br::BrCond C1, C2; | 42 InstX8632::BrCond C1, C2; |
43 } TableFcmp[] = { | 43 } TableFcmp[] = { |
44 #define X(val, dflt, swap, C1, C2) \ | 44 #define X(val, dflt, swap, C1, C2) \ |
45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ | 45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ |
46 , | 46 , |
47 FCMPX8632_TABLE | 47 FCMPX8632_TABLE |
48 #undef X | 48 #undef X |
49 }; | 49 }; |
50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); | 50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); |
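
For orientation, a minimal sketch (not code from this CL) of how a lowering routine like lowerFcmp() might consult TableFcmp; InstFcmp::FCond as the index type and a Br_None "no second branch" sentinel are assumptions here:

  // Hypothetical accessor, for illustration only; the real lowerFcmp()
  // presumably does this lookup inline.
  InstX8632::BrCond getFcmpBranches(InstFcmp::FCond Cond, bool &SwapOperands,
                                    InstX8632::BrCond &C2) {
    size_t Index = static_cast<size_t>(Cond);
    assert(Index < TableFcmpSize);
    SwapOperands = TableFcmp[Index].SwapOperands; // compare operands reversed?
    C2 = TableFcmp[Index].C2;   // second branch, or the sentinel if unused
    return TableFcmp[Index].C1; // first (possibly only) conditional branch
  }
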
51 | 51 |
52 // The following table summarizes the logic for lowering the icmp instruction | 52 // The following table summarizes the logic for lowering the icmp instruction |
53 // for i32 and narrower types. Each icmp condition has a clear mapping to an | 53 // for i32 and narrower types. Each icmp condition has a clear mapping to an |
54 // x86 conditional branch instruction. | 54 // x86 conditional branch instruction. |
55 | 55 |
56 const struct TableIcmp32_ { | 56 const struct TableIcmp32_ { |
57 InstX8632Br::BrCond Mapping; | 57 InstX8632::BrCond Mapping; |
58 } TableIcmp32[] = { | 58 } TableIcmp32[] = { |
59 #define X(val, C_32, C1_64, C2_64, C3_64) \ | 59 #define X(val, C_32, C1_64, C2_64, C3_64) \ |
60 { InstX8632Br::C_32 } \ | 60 { InstX8632Br::C_32 } \ |
61 , | 61 , |
62 ICMPX8632_TABLE | 62 ICMPX8632_TABLE |
63 #undef X | 63 #undef X |
64 }; | 64 }; |
65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); | 65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); |
66 | 66 |
67 // The following table summarizes the logic for lowering the icmp instruction | 67 // The following table summarizes the logic for lowering the icmp instruction |
68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and | 68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and |
69 // conditional branches are needed. For the other conditions, three separate | 69 // conditional branches are needed. For the other conditions, three separate |
70 // conditional branches are needed. | 70 // conditional branches are needed. |
71 const struct TableIcmp64_ { | 71 const struct TableIcmp64_ { |
72 InstX8632Br::BrCond C1, C2, C3; | 72 InstX8632::BrCond C1, C2, C3; |
73 } TableIcmp64[] = { | 73 } TableIcmp64[] = { |
74 #define X(val, C_32, C1_64, C2_64, C3_64) \ | 74 #define X(val, C_32, C1_64, C2_64, C3_64) \ |
75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ | 75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ |
76 , | 76 , |
77 ICMPX8632_TABLE | 77 ICMPX8632_TABLE |
78 #undef X | 78 #undef X |
79 }; | 79 }; |
80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); | 80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); |
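
As a concrete reading of the three-branch scheme, here is a hedged sketch (register names and branch labels illustrative, not from this CL) of what an unsigned i64 less-than compare-and-branch expands to:

  // Sketch for "br (icmp ult i64 a, b), Target, Next":
  //   cmp a_hi, b_hi
  //   jb  Target       ; C1_64: high words already prove a < b
  //   ja  Next         ; C2_64: high words already prove a >= b
  //   cmp a_lo, b_lo
  //   jb  Target       ; C3_64: high words equal, low words decide
  // Next:
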
81 | 81 |
82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { | 82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { |
83 size_t Index = static_cast<size_t>(Cond); | 83 size_t Index = static_cast<size_t>(Cond); |
84 assert(Index < TableIcmp32Size); | 84 assert(Index < TableIcmp32Size); |
85 return TableIcmp32[Index].Mapping; | 85 return TableIcmp32[Index].Mapping; |
86 } | 86 } |
87 | 87 |
88 // The maximum number of arguments to pass in XMM registers | 88 // The maximum number of arguments to pass in XMM registers |
89 const unsigned X86_MAX_XMM_ARGS = 4; | 89 const unsigned X86_MAX_XMM_ARGS = 4; |
90 | 90 |
91 // In some cases, there are x-macros tables for both high-level and | 91 // In some cases, there are x-macros tables for both high-level and |
92 // low-level instructions/operands that use the same enum key value. | 92 // low-level instructions/operands that use the same enum key value. |
(...skipping 2009 matching lines...) | |
2102 _storeq(T, Addr); | 2102 _storeq(T, Addr); |
2103 _mfence(); | 2103 _mfence(); |
2104 return; | 2104 return; |
2105 } | 2105 } |
2106 InstStore *Store = InstStore::create(Func, Value, Ptr); | 2106 InstStore *Store = InstStore::create(Func, Value, Ptr); |
2107 lowerStore(Store); | 2107 lowerStore(Store); |
2108 _mfence(); | 2108 _mfence(); |
2109 return; | 2109 return; |
2110 } | 2110 } |
2111 case Intrinsics::Bswap: | 2111 case Intrinsics::Bswap: |
2112 case Intrinsics::Ctlz: | |
2113 case Intrinsics::Ctpop: | |
2114 case Intrinsics::Cttz: | |
2115 // TODO(jvoung): fill it in. | |
2116 Func->setError("Unhandled intrinsic"); | 2112 Func->setError("Unhandled intrinsic"); |
2117 return; | 2113 return; |
2114 case Intrinsics::Ctpop: { | |
2115 Variable *Dest = Instr->getDest(); | |
2116 Operand *Val = Instr->getArg(0); | |
2117 InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ? | |
Jim Stichnoth 2014/07/15 22:08:09
Here and below, I'd prefer to make type comparison…
jvoung (off chromium) 2014/07/15 23:10:33
Done. Yep, would be good to identify 64-bit speci…
| |
2118 "__popcountsi2" : "__popcountdi2", Dest, 1); | |
2119 Call->addArg(Val); | |
2120 lowerCall(Call); | |
2121 // The popcount helpers always return 32-bit values, while the intrinsic's | |
2122 // signature matches the native POPCNT instruction and fills a 64-bit reg | |
2123 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case | |
2124 // the user doesn't do that in the IR. If the user does that in the IR, | |
2125 // then this zeroing instruction is dead and gets optimized out. | |
2126 if (Val->getType() == IceType_i64) { | |
2127 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | |
2128 Constant *Zero = Ctx->getConstantZero(IceType_i32); | |
2129 _mov(DestHi, Zero); | |
2130 } | |
2131 return; | |
2132 } | |
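
Concretely, a 64-bit ctpop should then boil down to something like the following sketch (x86-32 stack argument passing is assumed, and operand names are illustrative):

  //   push Val_hi                 ; 64-bit argument for the helper
  //   push Val_lo
  //   call __popcountdi2          ; 32-bit result arrives in eax
  //   mov  Dest_lo, eax
  //   mov  Dest_hi, 0             ; the explicit zeroing added above
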
2133 case Intrinsics::Ctlz: { | |
2134 // The "is zero undef" parameter is ignored and we always return | |
2135 // a well-defined value. | |
2136 const bool IsCttz = false; | |
2137 Operand *Val = legalize(Instr->getArg(0)); | |
2138 Operand *FirstVal; | |
2139 Operand *SecondVal = NULL; | |
2140 if (Val->getType() == IceType_i32) { | |
2141 FirstVal = Val; | |
2142 } else { | |
2143 FirstVal = loOperand(Val); | |
2144 SecondVal = hiOperand(Val); | |
2145 } | |
2146 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal); | |
Jim Stichnoth 2014/07/15 22:08:09
80-col
jvoung (off chromium) 2014/07/15 23:10:33
Done.
| |
2147 return; | |
2148 } | |
2149 case Intrinsics::Cttz: { | |
2150 // The "is zero undef" parameter is ignored and we always return | |
2151 // a well-defined value. | |
2152 const bool IsCttz = true; | |
2153 Operand *Val = legalize(Instr->getArg(0)); | |
2154 Operand *FirstVal; | |
2155 Operand *SecondVal = NULL; | |
2156 if (Val->getType() == IceType_i32) { | |
2157 FirstVal = Val; | |
2158 } else { | |
2159 FirstVal = hiOperand(Val); | |
2160 SecondVal = loOperand(Val); | |
2161 } | |
2162 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal); | |
Jim Stichnoth 2014/07/15 22:08:09
80-col
jvoung (off chromium) 2014/07/15 23:10:33
Done.
| |
2163 return; | |
2164 } | |
2118 case Intrinsics::Longjmp: { | 2165 case Intrinsics::Longjmp: { |
2119 InstCall *Call = makeHelperCall("longjmp", NULL, 2); | 2166 InstCall *Call = makeHelperCall("longjmp", NULL, 2); |
2120 Call->addArg(Instr->getArg(0)); | 2167 Call->addArg(Instr->getArg(0)); |
2121 Call->addArg(Instr->getArg(1)); | 2168 Call->addArg(Instr->getArg(1)); |
2122 lowerCall(Call); | 2169 lowerCall(Call); |
2123 return; | 2170 return; |
2124 } | 2171 } |
2125 case Intrinsics::Memcpy: { | 2172 case Intrinsics::Memcpy: { |
2126 // In the future, we could potentially emit an inline memcpy/memset, etc. | 2173 // In the future, we could potentially emit an inline memcpy/memset, etc. |
2127 // for intrinsic calls w/ a known length. | 2174 // for intrinsic calls w/ a known length. |
(...skipping 273 matching lines...) | |
2401 // If Val is a variable, model the extended live range of Val through | 2448 // If Val is a variable, model the extended live range of Val through |
2402 // the end of the loop, since it will be re-used by the loop. | 2449 // the end of the loop, since it will be re-used by the loop. |
2403 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { | 2450 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { |
2404 Context.insert(InstFakeUse::create(Func, ValVar)); | 2451 Context.insert(InstFakeUse::create(Func, ValVar)); |
2405 } | 2452 } |
2406 // The address base is also reused in the loop. | 2453 // The address base is also reused in the loop. |
2407 Context.insert(InstFakeUse::create(Func, Addr->getBase())); | 2454 Context.insert(InstFakeUse::create(Func, Addr->getBase())); |
2408 _mov(Dest, T_eax); | 2455 _mov(Dest, T_eax); |
2409 } | 2456 } |
2410 | 2457 |
2458 // Lowers count {trailing, leading} zeros intrinsic. | |
2459 // | |
2460 // We could do constant folding here, but that should have | |
2461 // been done by the front-end/middle-end optimizations. | |
2462 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, | |
2463 Operand *FirstVal, Operand *SecondVal) { | |
2464 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI). | |
2465 // Then the instructions will handle the Val == 0 case much more simply | |
2466 // and won't require conversion from bit position to number of zeros. | |
2467 // | |
2468 // Otherwise: | |
2469 // bsr IF_NOT_ZERO, Val | |
2470 // mov T_DEST, 63 | |
2471 // cmovne T_DEST, IF_NOT_ZERO | |
2472 // xor T_DEST, 31 | |
2473 // mov DEST, T_DEST | |
2474 // | |
2475 // NOTE: T_DEST must be a register because cmov requires its dest to be a | |
2476 // register. Also, bsf and bsr require their dest to be a register. | |
2477 // | |
2478 // The xor DEST, 31 converts a bit position to # of leading zeroes. | |
2479 // E.g., for 000... 00001100, bsr will say that the most significant bit | |
2480 // set is at position 3, while the number of leading zeros is 28. Xor is | |
2481 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case). | |
2482 // | |
2483 // Similar for 64-bit, but start w/ speculating that the upper 32 bits | |
2484 // are all zero, and compute the result for that case (checking the lower | |
2485 // 32 bits). Then actually compute the result for the upper bits and | |
2486 // cmov in the result from the lower computation if the earlier speculation | |
2487 // was correct. | |
2488 // | |
2489 // Cttz is similar, but uses bsf instead, doesn't require the xor | |
2490 // bit-position conversion, and the speculation is reversed. | |
2491 assert(Ty == IceType_i32 || Ty == IceType_i64); | |
2492 Variable *T = makeReg(IceType_i32); | |
2493 if (Cttz) { | |
2494 _bsf(T, FirstVal); | |
2495 } else { | |
2496 _bsr(T, FirstVal); | |
2497 } | |
2498 Variable *T_Dest = makeReg(IceType_i32); | |
2499 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32); | |
2500 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31); | |
2501 if (Cttz) { | |
2502 _mov(T_Dest, ThirtyTwo); | |
2503 } else { | |
2504 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63); | |
2505 _mov(T_Dest, SixtyThree); | |
2506 } | |
2507 _cmov(T_Dest, T, InstX8632::Br_ne); | |
2508 if (!Cttz) { | |
2509 _xor(T_Dest, ThirtyOne); | |
2510 } | |
2511 if (Ty == IceType_i32) { | |
2512 _mov(Dest, T_Dest); | |
2513 return; | |
2514 } | |
2515 _add(T_Dest, ThirtyTwo); | |
2516 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | |
2517 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | |
2518 // Will be using "test" on this, so we need a registerized variable. | |
2519 Variable *SecondVar = legalizeToVar(SecondVal); | |
2520 Variable *T_Dest2 = makeReg(IceType_i32); | |
2521 if (Cttz) { | |
2522 _bsf(T_Dest2, SecondVar); | |
2523 } else { | |
2524 _bsr(T_Dest2, SecondVar); | |
2525 _xor(T_Dest2, ThirtyOne); | |
2526 } | |
2527 _test(SecondVar, SecondVar); | |
2528 _cmov(T_Dest2, T_Dest, InstX8632::Br_e); | |
2529 _mov(DestLo, T_Dest2); | |
2530 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | |
2531 } | |
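
Tracing the routine for the 64-bit cttz case (where the dispatch above passes FirstVal = hi, SecondVal = lo), the emitted sequence is approximately the following sketch (register names illustrative):

  //   bsf    T, Val_hi        ; T undefined if Val_hi == 0, but ZF is set
  //   mov    T_Dest, 32
  //   cmovne T_Dest, T        ; Val_hi != 0 ? bsf(Val_hi) : 32
  //   add    T_Dest, 32       ; speculate Val_lo == 0: 32 + bsf(Val_hi), or 64
  //   bsf    T_Dest2, Val_lo
  //   test   Val_lo, Val_lo
  //   cmove  T_Dest2, T_Dest  ; Val_lo == 0 ? speculated value : bsf(Val_lo)
  //   mov    Dest_lo, T_Dest2
  //   mov    Dest_hi, 0       ; a 64-bit count always fits in 32 bits
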
2532 | |
2411 namespace { | 2533 namespace { |
2412 | 2534 |
2413 bool isAdd(const Inst *Inst) { | 2535 bool isAdd(const Inst *Inst) { |
2414 if (const InstArithmetic *Arith = | 2536 if (const InstArithmetic *Arith = |
2415 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { | 2537 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { |
2416 return (Arith->getOp() == InstArithmetic::Add); | 2538 return (Arith->getOp() == InstArithmetic::Add); |
2417 } | 2539 } |
2418 return false; | 2540 return false; |
2419 } | 2541 } |
2420 | 2542 |
(...skipping 654 matching lines...) | |
3075 for (SizeT i = 0; i < Size; ++i) { | 3197 for (SizeT i = 0; i < Size; ++i) { |
3076 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 3198 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
3077 } | 3199 } |
3078 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 3200 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
3079 } | 3201 } |
3080 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 3202 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
3081 << "\n"; | 3203 << "\n"; |
3082 } | 3204 } |
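
For a concrete picture of the output, a hypothetical 2-byte internal global g (the label and any directives emitted above this fragment are omitted) would produce:

  //   .byte 202
  //   .byte 254
  //   .size g, 2
  //   .local g
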
3083 | 3205 |
3084 } // end of namespace Ice | 3206 } // end of namespace Ice |