Chromium Code Reviews

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. (Closed)
Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: "try to merge the two" (Created 6 years, 5 months ago)
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 // 9 //
10 // This file implements the TargetLoweringX8632 class, which 10 // This file implements the TargetLoweringX8632 class, which
(...skipping 21 matching lines...)
32 // lowerFcmp() describes the lowering template. In the most general case, there 32 // lowerFcmp() describes the lowering template. In the most general case, there
33 // is a compare followed by two conditional branches, because some fcmp 33 // is a compare followed by two conditional branches, because some fcmp
34 // conditions don't map to a single x86 conditional branch. However, in many 34 // conditions don't map to a single x86 conditional branch. However, in many
35 // cases it is possible to swap the operands in the comparison and have a single 35 // cases it is possible to swap the operands in the comparison and have a single
36 // conditional branch. Since it's quite tedious to validate the table by hand, 36 // conditional branch. Since it's quite tedious to validate the table by hand,
37 // good execution tests are helpful. 37 // good execution tests are helpful.
38 38
39 const struct TableFcmp_ { 39 const struct TableFcmp_ {
40 uint32_t Default; 40 uint32_t Default;
41 bool SwapOperands; 41 bool SwapOperands;
42 InstX8632Br::BrCond C1, C2; 42 InstX8632::BrCond C1, C2;
43 } TableFcmp[] = { 43 } TableFcmp[] = {
44 #define X(val, dflt, swap, C1, C2) \ 44 #define X(val, dflt, swap, C1, C2) \
45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ 45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \
46 , 46 ,
47 FCMPX8632_TABLE 47 FCMPX8632_TABLE
48 #undef X 48 #undef X
49 }; 49 };
50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); 50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
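As a hedged illustration of the comment above (not part of the patch; fcmpOeqModel is a hypothetical name), the following C++ model shows why some fcmp conditions need two conditional branches: after ucomiss, the unordered (NaN) case sets ZF together with PF, so a lone je would wrongly accept NaN == NaN, and ordered-equal is therefore lowered with a branch on the parity flag followed by a branch on the zero flag.

// Hedged sketch, not from the Subzero sources.
bool fcmpOeqModel(float A, float B) {
  bool Unordered = (A != A) || (B != B); // models the parity flag set by ucomiss
  if (Unordered)
    return false;                        // first conditional branch (jp)
  return A == B;                         // second conditional branch (je)
}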
51 51
52 // The following table summarizes the logic for lowering the icmp instruction 52 // The following table summarizes the logic for lowering the icmp instruction
53 // for i32 and narrower types. Each icmp condition has a clear mapping to an 53 // for i32 and narrower types. Each icmp condition has a clear mapping to an
54 // x86 conditional branch instruction. 54 // x86 conditional branch instruction.
55 55
56 const struct TableIcmp32_ { 56 const struct TableIcmp32_ {
57 InstX8632Br::BrCond Mapping; 57 InstX8632::BrCond Mapping;
58 } TableIcmp32[] = { 58 } TableIcmp32[] = {
59 #define X(val, C_32, C1_64, C2_64, C3_64) \ 59 #define X(val, C_32, C1_64, C2_64, C3_64) \
60 { InstX8632Br::C_32 } \ 60 { InstX8632Br::C_32 } \
61 , 61 ,
62 ICMPX8632_TABLE 62 ICMPX8632_TABLE
63 #undef X 63 #undef X
64 }; 64 };
65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); 65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
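The #define X ... #undef X idiom that builds TableFcmp, TableIcmp32, and TableIcmp64 is the standard X-macro pattern: one master list is expanded once per consumer, so the enum keys and the lowering tables cannot drift apart. A minimal standalone sketch with hypothetical names:

// Hedged sketch of the X-macro pattern; names are made up for illustration.
#define DEMO_ICMP_TABLE \
  X(Eq, "je")           \
  X(Ne, "jne")          \
  X(Ult, "jb")

enum DemoCond {
#define X(val, br) Demo_##val,
  DEMO_ICMP_TABLE
#undef X
};

const char *DemoBranch[] = {
#define X(val, br) br,
  DEMO_ICMP_TABLE
#undef X
};
// DemoBranch[Demo_Ult] == "jb": one list, two consumers, kept in sync.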
66 66
67 // The following table summarizes the logic for lowering the icmp instruction 67 // The following table summarizes the logic for lowering the icmp instruction
68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and 68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and
69 // conditional branches are needed. For the other conditions, three separate 69 // conditional branches are needed. For the other conditions, three separate
70 // conditional branches are needed. 70 // conditional branches are needed.
71 const struct TableIcmp64_ { 71 const struct TableIcmp64_ {
72 InstX8632Br::BrCond C1, C2, C3; 72 InstX8632::BrCond C1, C2, C3;
73 } TableIcmp64[] = { 73 } TableIcmp64[] = {
74 #define X(val, C_32, C1_64, C2_64, C3_64) \ 74 #define X(val, C_32, C1_64, C2_64, C3_64) \
75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ 75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \
76 , 76 ,
77 ICMPX8632_TABLE 77 ICMPX8632_TABLE
78 #undef X 78 #undef X
79 }; 79 };
80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); 80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
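To make the three-branch scheme concrete, here is a hedged C++ model (icmpUlt64Model is a hypothetical name, not from the patch) of deciding an unsigned 64-bit less-than using only 32-bit compares: two branches on the high halves, and a third on the low halves when the high halves are equal.

#include <cstdint>

// Hedged sketch mirroring the C1/C2/C3 branches described above.
bool icmpUlt64Model(uint32_t AHi, uint32_t ALo, uint32_t BHi, uint32_t BLo) {
  if (AHi < BHi)
    return true;    // C1: jb on the high halves
  if (AHi > BHi)
    return false;   // C2: ja on the high halves
  return ALo < BLo; // C3: jb on the low halves
}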
81 81
82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { 82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
83 size_t Index = static_cast<size_t>(Cond); 83 size_t Index = static_cast<size_t>(Cond);
84 assert(Index < TableIcmp32Size); 84 assert(Index < TableIcmp32Size);
85 return TableIcmp32[Index].Mapping; 85 return TableIcmp32[Index].Mapping;
86 } 86 }
87 87
88 // The maximum number of arguments to pass in XMM registers 88 // The maximum number of arguments to pass in XMM registers
89 const unsigned X86_MAX_XMM_ARGS = 4; 89 const unsigned X86_MAX_XMM_ARGS = 4;
90 90
91 // In some cases, there are x-macros tables for both high-level and 91 // In some cases, there are x-macros tables for both high-level and
92 // low-level instructions/operands that use the same enum key value. 92 // low-level instructions/operands that use the same enum key value.
(...skipping 2009 matching lines...)
2102 _storeq(T, Addr); 2102 _storeq(T, Addr);
2103 _mfence(); 2103 _mfence();
2104 return; 2104 return;
2105 } 2105 }
2106 InstStore *Store = InstStore::create(Func, Value, Ptr); 2106 InstStore *Store = InstStore::create(Func, Value, Ptr);
2107 lowerStore(Store); 2107 lowerStore(Store);
2108 _mfence(); 2108 _mfence();
2109 return; 2109 return;
2110 } 2110 }
2111 case Intrinsics::Bswap: 2111 case Intrinsics::Bswap:
2112 case Intrinsics::Ctlz:
2113 case Intrinsics::Ctpop:
2114 case Intrinsics::Cttz:
2115 // TODO(jvoung): fill it in.
2116 Func->setError("Unhandled intrinsic"); 2112 Func->setError("Unhandled intrinsic");
2117 return; 2113 return;
2114 case Intrinsics::Ctpop: {
2115 Variable *Dest = Instr->getDest();
2116 Operand *Val = Instr->getArg(0);
2117 InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ?
Jim Stichnoth 2014/07/15 22:08:09 Here and below, I'd prefer to make type comparison
jvoung (off chromium) 2014/07/15 23:10:33 Done. Yep, would be good to identify 64-bit speci
2118 "__popcountsi2" : "__popcountdi2", Dest, 1);
2119 Call->addArg(Val);
2120 lowerCall(Call);
2121 // The popcount helpers always return 32-bit values, while the intrinsic's
2122 // signature matches the native POPCNT instruction and fills a 64-bit reg
2123 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
2124 // the user doesn't do that in the IR. If the user does that in the IR,
2125 // then this zeroing instruction is dead and gets optimized out.
2126 if (Val->getType() == IceType_i64) {
2127 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2128 Constant *Zero = Ctx->getConstantZero(IceType_i32);
2129 _mov(DestHi, Zero);
2130 }
2131 return;
2132 }
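The upper-half clearing in the Ctpop case can be modeled in plain C++ as follows (a hedged sketch; __builtin_popcountll merely stands in for the __popcountdi2 helper, whose result is a 32-bit value):

#include <cstdint>

// Hedged sketch of the lowered 64-bit ctpop: the helper returns an i32,
// so the high 32 bits of the i64 destination are cleared explicitly.
uint64_t ctpop64ViaHelperModel(uint64_t Val) {
  uint32_t DestLo = (uint32_t)__builtin_popcountll(Val); // the helper call
  uint32_t DestHi = 0;                                   // mov DestHi, 0
  return ((uint64_t)DestHi << 32) | DestLo;
}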
2133 case Intrinsics::Ctlz: {
2134 // The "is zero undef" parameter is ignored and we always return
2135 // a well-defined value.
2136 const bool IsCttz = false;
2137 Operand *Val = legalize(Instr->getArg(0));
2138 Operand *FirstVal;
2139 Operand *SecondVal = NULL;
2140 if (Val->getType() == IceType_i32) {
2141 FirstVal = Val;
2142 } else {
2143 FirstVal = loOperand(Val);
2144 SecondVal = hiOperand(Val);
2145 }
2146 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal);
Jim Stichnoth 2014/07/15 22:08:09 80-col
jvoung (off chromium) 2014/07/15 23:10:33 Done.
2147 return;
2148 }
2149 case Intrinsics::Cttz: {
2150 // The "is zero undef" parameter is ignored and we always return
2151 // a well-defined value.
2152 const bool IsCttz = true;
2153 Operand *Val = legalize(Instr->getArg(0));
2154 Operand *FirstVal;
2155 Operand *SecondVal = NULL;
2156 if (Val->getType() == IceType_i32) {
2157 FirstVal = Val;
2158 } else {
2159 FirstVal = hiOperand(Val);
2160 SecondVal = loOperand(Val);
2161 }
2162 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal);
Jim Stichnoth 2014/07/15 22:08:09 80-col
jvoung (off chromium) 2014/07/15 23:10:33 Done.
2163 return;
2164 }
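Note that the Cttz case feeds lowerCountZeros with the halves swapped relative to Ctlz, because the speculation direction is reversed. A hedged C++ model of that reversed speculation (hypothetical names, not part of the patch):

#include <cstdint>

// bsf-like helper: index of the least significant set bit; 32 when Val == 0.
static uint32_t cttz32Model(uint32_t Val) {
  for (uint32_t I = 0; I < 32; ++I)
    if (Val & (1u << I))
      return I;
  return 32;
}

uint64_t cttz64Model(uint64_t Val) {
  uint32_t Lo = (uint32_t)Val, Hi = (uint32_t)(Val >> 32);
  uint32_t Speculated = cttz32Model(Hi) + 32; // assume the low half is zero
  uint32_t FromLo = cttz32Model(Lo);          // result if the low half is set
  return Lo == 0 ? Speculated : FromLo;       // test Lo; cmove
}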
2118 case Intrinsics::Longjmp: { 2165 case Intrinsics::Longjmp: {
2119 InstCall *Call = makeHelperCall("longjmp", NULL, 2); 2166 InstCall *Call = makeHelperCall("longjmp", NULL, 2);
2120 Call->addArg(Instr->getArg(0)); 2167 Call->addArg(Instr->getArg(0));
2121 Call->addArg(Instr->getArg(1)); 2168 Call->addArg(Instr->getArg(1));
2122 lowerCall(Call); 2169 lowerCall(Call);
2123 return; 2170 return;
2124 } 2171 }
2125 case Intrinsics::Memcpy: { 2172 case Intrinsics::Memcpy: {
2126 // In the future, we could potentially emit an inline memcpy/memset, etc. 2173 // In the future, we could potentially emit an inline memcpy/memset, etc.
2127 // for intrinsic calls w/ a known length. 2174 // for intrinsic calls w/ a known length.
(...skipping 273 matching lines...)
2401 // If Val is a variable, model the extended live range of Val through 2448 // If Val is a variable, model the extended live range of Val through
2402 // the end of the loop, since it will be re-used by the loop. 2449 // the end of the loop, since it will be re-used by the loop.
2403 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { 2450 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
2404 Context.insert(InstFakeUse::create(Func, ValVar)); 2451 Context.insert(InstFakeUse::create(Func, ValVar));
2405 } 2452 }
2406 // The address base is also reused in the loop. 2453 // The address base is also reused in the loop.
2407 Context.insert(InstFakeUse::create(Func, Addr->getBase())); 2454 Context.insert(InstFakeUse::create(Func, Addr->getBase()));
2408 _mov(Dest, T_eax); 2455 _mov(Dest, T_eax);
2409 } 2456 }
2410 2457
2458 // Lowers count {trailing, leading} zeros intrinsic.
2459 //
2460 // We could do constant folding here, but that should have
2461 // been done by the front-end/middle-end optimizations.
2462 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
2463 Operand *FirstVal, Operand *SecondVal) {
2464 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
2465 // Then the instructions will handle the Val == 0 case much more simply
2466 // and won't require conversion from bit position to number of zeros.
2467 //
2468 // Otherwise:
2469 // bsr IF_NOT_ZERO, Val
2470 // mov T_DEST, 63
2471 // cmovne T_DEST, IF_NOT_ZERO
2472 // xor T_DEST, 31
2473 // mov DEST, T_DEST
2474 //
2475 // NOTE: T_DEST must be a register because cmov requires its dest to be a
2476 // register. Also, bsf and bsr require their dest to be a register.
2477 //
2478 // The xor DEST, 31 converts a bit position to # of leading zeroes.
2479 // E.g., for 000... 00001100, bsr will say that the most significant bit
2480 // set is at position 3, while the number of leading zeros is 28. Xor is
2481 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
2482 //
2483 // Similar for 64-bit, but start w/ speculating that the upper 32 bits
2484 // are all zero, and compute the result for that case (checking the lower
2485 // 32 bits). Then actually compute the result for the upper bits and
2486 // cmov in the result from the lower computation if the earlier speculation
2487 // was correct.
2488 //
2489 // Cttz is similar, but uses bsf instead, and doesn't require the xor
2490 // bit position conversion, and the speculation is reversed.
2491 assert(Ty == IceType_i32 || Ty == IceType_i64);
2492 Variable *T = makeReg(IceType_i32);
2493 if (Cttz) {
2494 _bsf(T, FirstVal);
2495 } else {
2496 _bsr(T, FirstVal);
2497 }
2498 Variable *T_Dest = makeReg(IceType_i32);
2499 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);
2500 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);
2501 if (Cttz) {
2502 _mov(T_Dest, ThirtyTwo);
2503 } else {
2504 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);
2505 _mov(T_Dest, SixtyThree);
2506 }
2507 _cmov(T_Dest, T, InstX8632::Br_ne);
2508 if (!Cttz) {
2509 _xor(T_Dest, ThirtyOne);
2510 }
2511 if (Ty == IceType_i32) {
2512 _mov(Dest, T_Dest);
2513 return;
2514 }
2515 _add(T_Dest, ThirtyTwo);
2516 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
2517 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2518 // Will be using "test" on this, so we need a registerized variable.
2519 Variable *SecondVar = legalizeToVar(SecondVal);
2520 Variable *T_Dest2 = makeReg(IceType_i32);
2521 if (Cttz) {
2522 _bsf(T_Dest2, SecondVar);
2523 } else {
2524 _bsr(T_Dest2, SecondVar);
2525 _xor(T_Dest2, ThirtyOne);
2526 }
2527 _test(SecondVar, SecondVar);
2528 _cmov(T_Dest2, T_Dest, InstX8632::Br_e);
2529 _mov(DestLo, T_Dest2);
2530 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
2531 }
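The long comment at the top of lowerCountZeros can be checked against a small, purely illustrative C++ model of the no-LZCNT ctlz lowering (hypothetical names, not part of the patch): bsr, a 63 fallback, cmovne, xor 31, and, for 64-bit, speculation that the high half is zero with a cmov to pick the high-half result otherwise.

#include <cstdint>

// bsr-like helper: index of the most significant set bit; callers handle Val == 0.
static uint32_t bsrModel(uint32_t Val) {
  uint32_t Index = 0;
  for (int I = 31; I >= 0; --I)
    if (Val & (1u << (uint32_t)I)) {
      Index = (uint32_t)I;
      break;
    }
  return Index;
}

uint32_t ctlz32Model(uint32_t Val) {
  uint32_t TDest = 63;     // mov T_DEST, 63
  if (Val != 0)
    TDest = bsrModel(Val); // cmovne T_DEST, IF_NOT_ZERO
  return TDest ^ 31;       // 31 - N for N <= 31; maps 63 to 32 for Val == 0
}

uint32_t ctlz64Model(uint64_t Val) {
  uint32_t Lo = (uint32_t)Val, Hi = (uint32_t)(Val >> 32);
  uint32_t Speculated = ctlz32Model(Lo) + 32; // assume the high half is zero
  uint32_t FromHi = ctlz32Model(Hi);          // result if the high half is set
  return Hi == 0 ? Speculated : FromHi;       // test Hi; cmove
}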
2532
2411 namespace { 2533 namespace {
2412 2534
2413 bool isAdd(const Inst *Inst) { 2535 bool isAdd(const Inst *Inst) {
2414 if (const InstArithmetic *Arith = 2536 if (const InstArithmetic *Arith =
2415 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { 2537 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
2416 return (Arith->getOp() == InstArithmetic::Add); 2538 return (Arith->getOp() == InstArithmetic::Add);
2417 } 2539 }
2418 return false; 2540 return false;
2419 } 2541 }
2420 2542
(...skipping 654 matching lines...)
3075 for (SizeT i = 0; i < Size; ++i) { 3197 for (SizeT i = 0; i < Size; ++i) {
3076 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; 3198 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
3077 } 3199 }
3078 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; 3200 Str << "\t.size\t" << MangledName << ", " << Size << "\n";
3079 } 3201 }
3080 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName 3202 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName
3081 << "\n"; 3203 << "\n";
3082 } 3204 }
3083 3205
3084 } // end of namespace Ice 3206 } // end of namespace Ice