Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// | 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 // | 9 // |
| 10 // This file implements the TargetLoweringX8632 class, which | 10 // This file implements the TargetLoweringX8632 class, which |
| (...skipping 21 matching lines...) Expand all Loading... | |
| 32 // lowerFcmp() describes the lowering template. In the most general case, there | 32 // lowerFcmp() describes the lowering template. In the most general case, there |
| 33 // is a compare followed by two conditional branches, because some fcmp | 33 // is a compare followed by two conditional branches, because some fcmp |
| 34 // conditions don't map to a single x86 conditional branch. However, in many | 34 // conditions don't map to a single x86 conditional branch. However, in many |
| 35 // cases it is possible to swap the operands in the comparison and have a single | 35 // cases it is possible to swap the operands in the comparison and have a single |
| 36 // conditional branch. Since it's quite tedious to validate the table by hand, | 36 // conditional branch. Since it's quite tedious to validate the table by hand, |
| 37 // good execution tests are helpful. | 37 // good execution tests are helpful. |
| 38 | 38 |
| 39 const struct TableFcmp_ { | 39 const struct TableFcmp_ { |
| 40 uint32_t Default; | 40 uint32_t Default; |
| 41 bool SwapOperands; | 41 bool SwapOperands; |
| 42 InstX8632Br::BrCond C1, C2; | 42 InstX8632::BrCond C1, C2; |
| 43 } TableFcmp[] = { | 43 } TableFcmp[] = { |
| 44 #define X(val, dflt, swap, C1, C2) \ | 44 #define X(val, dflt, swap, C1, C2) \ |
| 45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ | 45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ |
| 46 , | 46 , |
| 47 FCMPX8632_TABLE | 47 FCMPX8632_TABLE |
| 48 #undef X | 48 #undef X |
| 49 }; | 49 }; |
| 50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); | 50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); |
| 51 | 51 |
| 52 // The following table summarizes the logic for lowering the icmp instruction | 52 // The following table summarizes the logic for lowering the icmp instruction |
| 53 // for i32 and narrower types. Each icmp condition has a clear mapping to an | 53 // for i32 and narrower types. Each icmp condition has a clear mapping to an |
| 54 // x86 conditional branch instruction. | 54 // x86 conditional branch instruction. |
| 55 | 55 |
| 56 const struct TableIcmp32_ { | 56 const struct TableIcmp32_ { |
| 57 InstX8632Br::BrCond Mapping; | 57 InstX8632::BrCond Mapping; |
| 58 } TableIcmp32[] = { | 58 } TableIcmp32[] = { |
| 59 #define X(val, C_32, C1_64, C2_64, C3_64) \ | 59 #define X(val, C_32, C1_64, C2_64, C3_64) \ |
| 60 { InstX8632Br::C_32 } \ | 60 { InstX8632Br::C_32 } \ |
| 61 , | 61 , |
| 62 ICMPX8632_TABLE | 62 ICMPX8632_TABLE |
| 63 #undef X | 63 #undef X |
| 64 }; | 64 }; |
| 65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); | 65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); |
| 66 | 66 |
| 67 // The following table summarizes the logic for lowering the icmp instruction | 67 // The following table summarizes the logic for lowering the icmp instruction |
| 68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and | 68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and |
| 69 // conditional branches are needed. For the other conditions, three separate | 69 // conditional branches are needed. For the other conditions, three separate |
| 70 // conditional branches are needed. | 70 // conditional branches are needed. |
| 71 const struct TableIcmp64_ { | 71 const struct TableIcmp64_ { |
| 72 InstX8632Br::BrCond C1, C2, C3; | 72 InstX8632::BrCond C1, C2, C3; |
| 73 } TableIcmp64[] = { | 73 } TableIcmp64[] = { |
| 74 #define X(val, C_32, C1_64, C2_64, C3_64) \ | 74 #define X(val, C_32, C1_64, C2_64, C3_64) \ |
| 75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ | 75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ |
| 76 , | 76 , |
| 77 ICMPX8632_TABLE | 77 ICMPX8632_TABLE |
| 78 #undef X | 78 #undef X |
| 79 }; | 79 }; |
| 80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); | 80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); |
| 81 | 81 |
| 82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { | 82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { |
| 83 size_t Index = static_cast<size_t>(Cond); | 83 size_t Index = static_cast<size_t>(Cond); |
| 84 assert(Index < TableIcmp32Size); | 84 assert(Index < TableIcmp32Size); |
| 85 return TableIcmp32[Index].Mapping; | 85 return TableIcmp32[Index].Mapping; |
| 86 } | 86 } |
| 87 | 87 |
| 88 // The maximum number of arguments to pass in XMM registers | 88 // The maximum number of arguments to pass in XMM registers |
| 89 const unsigned X86_MAX_XMM_ARGS = 4; | 89 const unsigned X86_MAX_XMM_ARGS = 4; |
| 90 | 90 |
| 91 // In some cases, there are x-macros tables for both high-level and | 91 // In some cases, there are x-macros tables for both high-level and |
| 92 // low-level instructions/operands that use the same enum key value. | 92 // low-level instructions/operands that use the same enum key value. |
| (...skipping 2008 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2101 _storeq(T, Addr); | 2101 _storeq(T, Addr); |
| 2102 _mfence(); | 2102 _mfence(); |
| 2103 return; | 2103 return; |
| 2104 } | 2104 } |
| 2105 InstStore *Store = InstStore::create(Func, Value, Ptr); | 2105 InstStore *Store = InstStore::create(Func, Value, Ptr); |
| 2106 lowerStore(Store); | 2106 lowerStore(Store); |
| 2107 _mfence(); | 2107 _mfence(); |
| 2108 return; | 2108 return; |
| 2109 } | 2109 } |
| 2110 case Intrinsics::Bswap: | 2110 case Intrinsics::Bswap: |
| 2111 Func->setError("Unhandled intrinsic"); | |
| 2112 return; | |
| 2113 case Intrinsics::Ctpop: { | |
| 2114 Variable *Dest = Instr->getDest(); | |
| 2115 Operand *Val = Instr->getArg(0); | |
| 2116 InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ? | |
| 2117 "__popcountsi2" : "__popcountdi2", Dest, 1); | |
| 2118 Call->addArg(Val); | |
| 2119 lowerCall(Call); | |
| 2120 // The popcount helpers always return 32-bit values, while the intrinsic's | |
| 2121 // signature matches the native POPCNT instruction and fills a 64-bit reg | |
| 2122 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case | |
| 2123 // the user doesn't do that in the IR. If the user does that in the IR, | |
| 2124 // then this zero'ing instruction is dead and gets optimized out. | |
| 2125 if (Val->getType() == IceType_i64) { | |
| 2126 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | |
| 2127 Constant *Zero = Ctx->getConstantZero(IceType_i32); | |
| 2128 _mov(DestHi, Zero); | |
| 2129 } | |
| 2130 return; | |
| 2131 } | |
| 2111 case Intrinsics::Ctlz: | 2132 case Intrinsics::Ctlz: |
| 2112 case Intrinsics::Ctpop: | 2133 // The "is zero undef" parameter is ignored and we always return |
| 2134 // a well-defined value. | |
| 2135 lowerCtlz(Instr->getDest(), Instr->getArg(0)); | |
| 2136 return; | |
| 2113 case Intrinsics::Cttz: | 2137 case Intrinsics::Cttz: |
| 2114 // TODO(jvoung): fill it in. | 2138 // The "is zero undef" parameter is ignored and we always return |
| 2115 Func->setError("Unhandled intrinsic"); | 2139 // a well-defined value. |
| 2140 lowerCttz(Instr->getDest(), Instr->getArg(0)); | |
| 2116 return; | 2141 return; |
| 2117 case Intrinsics::Longjmp: { | 2142 case Intrinsics::Longjmp: { |
| 2118 InstCall *Call = makeHelperCall("longjmp", NULL, 2); | 2143 InstCall *Call = makeHelperCall("longjmp", NULL, 2); |
| 2119 Call->addArg(Instr->getArg(0)); | 2144 Call->addArg(Instr->getArg(0)); |
| 2120 Call->addArg(Instr->getArg(1)); | 2145 Call->addArg(Instr->getArg(1)); |
| 2121 lowerCall(Call); | 2146 lowerCall(Call); |
| 2122 return; | 2147 return; |
| 2123 } | 2148 } |
| 2124 case Intrinsics::Memcpy: { | 2149 case Intrinsics::Memcpy: { |
| 2125 // In the future, we could potentially emit an inline memcpy/memset, etc. | 2150 // In the future, we could potentially emit an inline memcpy/memset, etc. |
| (...skipping 274 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2400 // If Val is a variable, model the extended live range of Val through | 2425 // If Val is a variable, model the extended live range of Val through |
| 2401 // the end of the loop, since it will be re-used by the loop. | 2426 // the end of the loop, since it will be re-used by the loop. |
| 2402 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { | 2427 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { |
| 2403 Context.insert(InstFakeUse::create(Func, ValVar)); | 2428 Context.insert(InstFakeUse::create(Func, ValVar)); |
| 2404 } | 2429 } |
| 2405 // The address base is also reused in the loop. | 2430 // The address base is also reused in the loop. |
| 2406 Context.insert(InstFakeUse::create(Func, Addr->getBase())); | 2431 Context.insert(InstFakeUse::create(Func, Addr->getBase())); |
| 2407 _mov(Dest, T_eax); | 2432 _mov(Dest, T_eax); |
| 2408 } | 2433 } |
| 2409 | 2434 |
| 2435 // We could do constant folding here, but that should have | |
| 2436 // been done by the front-end/middle-end optimizations. | |
| 2437 void TargetX8632::lowerCtlz(Variable *Dest, Operand *Val) { | |
| 2438 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI). | |
| 2439 // Then the instructions will handle the Val == 0 case much more simply | |
| 2440 // and won't require conversion from bit position to number of zeros. | |
| 2441 // | |
| 2442 // Otherwise: | |
| 2443 // bsr IF_NOT_ZERO, Val | |
| 2444 // mov DEST, 63 | |
| 2445 // cmovne DEST, IF_NOT_ZERO | |
|
> **Jim Stichnoth** (2014/07/14 23:20:45): "Might want to document that the cmov instruction r…" *[comment truncated in page export]*
> **jvoung (off chromium)** (2014/07/15 21:30:23): "Done. I think the cvt instructions and some/most…" *[comment truncated in page export]*
| |
| 2446 // xor DEST, 31 | |
| 2447 // | |
| 2448 // The xor DEST, 31 converts a bit position to # of leading zeroes. | |
| 2449 // E.g., for 000... 00001100, bsr will say that the most significant bit | |
| 2450 // set is at position 3, while the number of leading zeros is 28. Xor is | |
| 2451 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case). | |
| 2452 // | |
| 2453 // Similar for 64-bit, but start w/ speculating that the upper 32 bits | |
| 2454 // are all zero, and compute the result for that case (checking the lower | |
| 2455 // 32 bits). Then actually compute the result for the upper bits and | |
| 2456 // cmov in the result from the lower computation if the earlier speculation | |
| 2457 // was correct. | |
| 2458 Type Ty = Val->getType(); | |
| 2459 assert(Ty == IceType_i32 || Ty == IceType_i64); | |
| 2460 Val = legalize(Val); | |
| 2461 Operand *FirstVal = Ty == IceType_i32 ? Val : loOperand(Val); | |
| 2462 Variable *T = makeReg(IceType_i32); | |
| 2463 _bsr(T, FirstVal); | |
| 2464 Variable *T_Dest = makeReg(IceType_i32); | |
| 2465 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63); | |
| 2466 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31); | |
| 2467 _mov(T_Dest, SixtyThree); | |
| 2468 _cmov(T_Dest, T, InstX8632::Br_ne); | |
| 2469 _xor(T_Dest, ThirtyOne); | |
| 2470 if (Ty == IceType_i32) { | |
| 2471 _mov(Dest, T_Dest); | |
| 2472 return; | |
| 2473 } | |
| 2474 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32); | |
| 2475 _add(T_Dest, ThirtyTwo); | |
| 2476 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | |
| 2477 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | |
| 2478 // Will be using "test" on this, so we need a registerized version. | |
| 2479 Variable *HigherVar = legalizeToVar(hiOperand(Val)); | |
| 2480 Variable *T_Dest2 = makeReg(IceType_i32); | |
| 2481 _bsr(T_Dest2, HigherVar); | |
| 2482 _xor(T_Dest2, ThirtyOne); | |
| 2483 _test(HigherVar, HigherVar); | |
| 2484 _cmov(T_Dest2, T_Dest, InstX8632::Br_e); | |
| 2485 _mov(DestLo, T_Dest2); | |
| 2486 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | |
| 2487 } | |
| 2488 | |
| 2489 void TargetX8632::lowerCttz(Variable *Dest, Operand *Val) { | |
| 2490 // TODO(jvoung): Determine if the user CPU supports TZCNT (BMI). | |
|
> **Jim Stichnoth** (2014/07/14 23:20:45): "Would it make sense to refactor this and lowerCtlz…" *[comment truncated in page export]*
> **jvoung (off chromium)** (2014/07/15 21:30:23): "Hmm, I tried that in the new patch set. There are…" *[comment truncated in page export]*
| |
| 2491 // Then the instructions will handle the Val == 0 case much more simply. | |
| 2492 // | |
| 2493 // Otherwise: | |
| 2494 // bsf IF_NOT_ZERO, Val | |
| 2495 // mov IF_ZERO, 32 | |
| 2496 // cmovne IF_ZERO, IF_NOT_ZERO | |
| 2497 // | |
| 2498 // Similar for 64-bit, but start w/ speculating that the bottom 32 bits | |
| 2499 // are all zero. | |
| 2500 Type Ty = Val->getType(); | |
| 2501 assert(Ty == IceType_i32 || Ty == IceType_i64); | |
| 2502 Val = legalize(Val); | |
| 2503 Operand *FirstVal = Ty == IceType_i32 ? Val : hiOperand(Val); | |
| 2504 Variable *T = makeReg(IceType_i32); | |
| 2505 _bsf(T, FirstVal); | |
| 2506 Variable *T_Dest = makeReg(IceType_i32); | |
| 2507 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32); | |
| 2508 _mov(T_Dest, ThirtyTwo); | |
| 2509 _cmov(T_Dest, T, InstX8632::Br_ne); | |
| 2510 if (Ty == IceType_i32) { | |
| 2511 _mov(Dest, T_Dest); | |
| 2512 return; | |
| 2513 } | |
| 2514 _add(T_Dest, ThirtyTwo); | |
| 2515 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); | |
| 2516 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); | |
| 2517 // Will be using "test" on this, so we need a registerized version. | |
| 2518 Variable *LowerVar = legalizeToVar(loOperand(Val)); | |
| 2519 Variable *T_Dest2 = makeReg(IceType_i32); | |
| 2520 _bsf(T_Dest2, LowerVar); | |
| 2521 _test(LowerVar, LowerVar); | |
| 2522 _cmov(T_Dest2, T_Dest, InstX8632::Br_e); | |
| 2523 _mov(DestLo, T_Dest2); | |
| 2524 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | |
| 2525 } | |
| 2526 | |
| 2410 namespace { | 2527 namespace { |
| 2411 | 2528 |
| 2412 bool isAdd(const Inst *Inst) { | 2529 bool isAdd(const Inst *Inst) { |
| 2413 if (const InstArithmetic *Arith = | 2530 if (const InstArithmetic *Arith = |
| 2414 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { | 2531 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { |
| 2415 return (Arith->getOp() == InstArithmetic::Add); | 2532 return (Arith->getOp() == InstArithmetic::Add); |
| 2416 } | 2533 } |
| 2417 return false; | 2534 return false; |
| 2418 } | 2535 } |
| 2419 | 2536 |
| (...skipping 649 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3069 for (SizeT i = 0; i < Size; ++i) { | 3186 for (SizeT i = 0; i < Size; ++i) { |
| 3070 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; | 3187 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; |
| 3071 } | 3188 } |
| 3072 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; | 3189 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; |
| 3073 } | 3190 } |
| 3074 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName | 3191 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName |
| 3075 << "\n"; | 3192 << "\n"; |
| 3076 } | 3193 } |
| 3077 | 3194 |
| 3078 } // end of namespace Ice | 3195 } // end of namespace Ice |
| OLD | NEW |