src/IceTargetLoweringX8632.cpp - Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now.

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: review Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff |

OLD	NEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//	1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//

2 //	2 //

3 // The Subzero Code Generator	3 // The Subzero Code Generator

4 //	4 //

5 // This file is distributed under the University of Illinois Open Source	5 // This file is distributed under the University of Illinois Open Source

6 // License. See LICENSE.TXT for details.	6 // License. See LICENSE.TXT for details.

7 //	7 //

8 //===----------------------------------------------------------------------===//	8 //===----------------------------------------------------------------------===//

9 //	9 //

10 // This file implements the TargetLoweringX8632 class, which	10 // This file implements the TargetLoweringX8632 class, which

(...skipping 21 matching lines...) Loading...
32 // lowerFcmp() describes the lowering template. In the most general case, there	32 // lowerFcmp() describes the lowering template. In the most general case, there

33 // is a compare followed by two conditional branches, because some fcmp	33 // is a compare followed by two conditional branches, because some fcmp

34 // conditions don't map to a single x86 conditional branch. However, in many	34 // conditions don't map to a single x86 conditional branch. However, in many

35 // cases it is possible to swap the operands in the comparison and have a single	35 // cases it is possible to swap the operands in the comparison and have a single

36 // conditional branch. Since it's quite tedious to validate the table by hand,	36 // conditional branch. Since it's quite tedious to validate the table by hand,

37 // good execution tests are helpful.	37 // good execution tests are helpful.

38	38

39 const struct TableFcmp_ {	39 const struct TableFcmp_ {

40 uint32_t Default;	40 uint32_t Default;

41 bool SwapOperands;	41 bool SwapOperands;

42 InstX8632Br::BrCond C1, C2;	42 InstX8632::BrCond C1, C2;

43 } TableFcmp[] = {	43 } TableFcmp[] = {

44 #define X(val, dflt, swap, C1, C2) \	44 #define X(val, dflt, swap, C1, C2) \

45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \	45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \

46 ,	46 ,

47 FCMPX8632_TABLE	47 FCMPX8632_TABLE

48 #undef X	48 #undef X

49 };	49 };

50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);	50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);

51	51

52 // The following table summarizes the logic for lowering the icmp instruction	52 // The following table summarizes the logic for lowering the icmp instruction

53 // for i32 and narrower types. Each icmp condition has a clear mapping to an	53 // for i32 and narrower types. Each icmp condition has a clear mapping to an

54 // x86 conditional branch instruction.	54 // x86 conditional branch instruction.

55	55

56 const struct TableIcmp32_ {	56 const struct TableIcmp32_ {

57 InstX8632Br::BrCond Mapping;	57 InstX8632::BrCond Mapping;

58 } TableIcmp32[] = {	58 } TableIcmp32[] = {

59 #define X(val, C_32, C1_64, C2_64, C3_64) \	59 #define X(val, C_32, C1_64, C2_64, C3_64) \

60 { InstX8632Br::C_32 } \	60 { InstX8632Br::C_32 } \

61 ,	61 ,

62 ICMPX8632_TABLE	62 ICMPX8632_TABLE

63 #undef X	63 #undef X

64 };	64 };

65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);	65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);

66	66

67 // The following table summarizes the logic for lowering the icmp instruction	67 // The following table summarizes the logic for lowering the icmp instruction

68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and	68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and

69 // conditional branches are needed. For the other conditions, three separate	69 // conditional branches are needed. For the other conditions, three separate

70 // conditional branches are needed.	70 // conditional branches are needed.

71 const struct TableIcmp64_ {	71 const struct TableIcmp64_ {

72 InstX8632Br::BrCond C1, C2, C3;	72 InstX8632::BrCond C1, C2, C3;

73 } TableIcmp64[] = {	73 } TableIcmp64[] = {

74 #define X(val, C_32, C1_64, C2_64, C3_64) \	74 #define X(val, C_32, C1_64, C2_64, C3_64) \

75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \	75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \

76 ,	76 ,

77 ICMPX8632_TABLE	77 ICMPX8632_TABLE

78 #undef X	78 #undef X

79 };	79 };

80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);	80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);

81	81

82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {	82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {

83 size_t Index = static_cast<size_t>(Cond);	83 size_t Index = static_cast<size_t>(Cond);

84 assert(Index < TableIcmp32Size);	84 assert(Index < TableIcmp32Size);

85 return TableIcmp32[Index].Mapping;	85 return TableIcmp32[Index].Mapping;

86 }	86 }

87	87

88 // The maximum number of arguments to pass in XMM registers	88 // The maximum number of arguments to pass in XMM registers

89 const unsigned X86_MAX_XMM_ARGS = 4;	89 const unsigned X86_MAX_XMM_ARGS = 4;

90	90

91 // In some cases, there are x-macros tables for both high-level and	91 // In some cases, there are x-macros tables for both high-level and

92 // low-level instructions/operands that use the same enum key value.	92 // low-level instructions/operands that use the same enum key value.

(...skipping 2009 matching lines...) Loading...
2102 _storeq(T, Addr);	2102 _storeq(T, Addr);

2103 _mfence();	2103 _mfence();

2104 return;	2104 return;

2105 }	2105 }

2106 InstStore *Store = InstStore::create(Func, Value, Ptr);	2106 InstStore *Store = InstStore::create(Func, Value, Ptr);

2107 lowerStore(Store);	2107 lowerStore(Store);

2108 _mfence();	2108 _mfence();

2109 return;	2109 return;

2110 }	2110 }

2111 case Intrinsics::Bswap:	2111 case Intrinsics::Bswap:

2112 case Intrinsics::Ctlz:

2113 case Intrinsics::Ctpop:

2114 case Intrinsics::Cttz:

2115 // TODO(jvoung): fill it in.

2116 Func->setError("Unhandled intrinsic");	2112 Func->setError("Unhandled intrinsic");

2117 return;	2113 return;

	2114 case Intrinsics::Ctpop: {

	2115 Variable *Dest = Instr->getDest();

	2116 Operand *Val = Instr->getArg(0);

	2117 InstCall *Call = makeHelperCall(Val->getType() == IceType_i64 ?

	2118 "__popcountdi2" : "__popcountsi2", Dest, 1);

	2119 Call->addArg(Val);

	2120 lowerCall(Call);

	2121 // The popcount helpers always return 32-bit values, while the intrinsic's

	2122 // signature matches the native POPCNT instruction and fills a 64-bit reg

	2123 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case

	2124 // the user doesn't do that in the IR. If the user does that in the IR,

	2125 // then this zero'ing instruction is dead and gets optimized out.

	2126 if (Val->getType() == IceType_i64) {

	2127 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

	2128 Constant *Zero = Ctx->getConstantZero(IceType_i32);

	2129 _mov(DestHi, Zero);

	2130 }

	2131 return;

	2132 }

	2133 case Intrinsics::Ctlz: {

	2134 // The "is zero undef" parameter is ignored and we always return

	2135 // a well-defined value.

	2136 Operand *Val = legalize(Instr->getArg(0));

	2137 Operand *FirstVal;

	2138 Operand *SecondVal = NULL;

	2139 if (Val->getType() == IceType_i64) {

	2140 FirstVal = loOperand(Val);

	2141 SecondVal = hiOperand(Val);

	2142 } else {

	2143 FirstVal = Val;

	2144 }

	2145 const bool IsCttz = false;

	2146 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,

	2147 SecondVal);

	2148 return;

	2149 }

	2150 case Intrinsics::Cttz: {

	2151 // The "is zero undef" parameter is ignored and we always return

	2152 // a well-defined value.

	2153 Operand *Val = legalize(Instr->getArg(0));

	2154 Operand *FirstVal;

	2155 Operand *SecondVal = NULL;

	2156 if (Val->getType() == IceType_i64) {

	2157 FirstVal = hiOperand(Val);

	2158 SecondVal = loOperand(Val);

	2159 } else {

	2160 FirstVal = Val;

	2161 }

	2162 const bool IsCttz = true;

	2163 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,

	2164 SecondVal);

	2165 return;

	2166 }

2118 case Intrinsics::Longjmp: {	2167 case Intrinsics::Longjmp: {

2119 InstCall *Call = makeHelperCall("longjmp", NULL, 2);	2168 InstCall *Call = makeHelperCall("longjmp", NULL, 2);

2120 Call->addArg(Instr->getArg(0));	2169 Call->addArg(Instr->getArg(0));

2121 Call->addArg(Instr->getArg(1));	2170 Call->addArg(Instr->getArg(1));

2122 lowerCall(Call);	2171 lowerCall(Call);

2123 return;	2172 return;

2124 }	2173 }

2125 case Intrinsics::Memcpy: {	2174 case Intrinsics::Memcpy: {

2126 // In the future, we could potentially emit an inline memcpy/memset, etc.	2175 // In the future, we could potentially emit an inline memcpy/memset, etc.

2127 // for intrinsic calls w/ a known length.	2176 // for intrinsic calls w/ a known length.

(...skipping 273 matching lines...) Loading...
2401 // If Val is a variable, model the extended live range of Val through	2450 // If Val is a variable, model the extended live range of Val through

2402 // the end of the loop, since it will be re-used by the loop.	2451 // the end of the loop, since it will be re-used by the loop.

2403 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {	2452 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {

2404 Context.insert(InstFakeUse::create(Func, ValVar));	2453 Context.insert(InstFakeUse::create(Func, ValVar));

2405 }	2454 }

2406 // The address base is also reused in the loop.	2455 // The address base is also reused in the loop.

2407 Context.insert(InstFakeUse::create(Func, Addr->getBase()));	2456 Context.insert(InstFakeUse::create(Func, Addr->getBase()));

2408 _mov(Dest, T_eax);	2457 _mov(Dest, T_eax);

2409 }	2458 }

2410	2459

	2460 // Lowers count {trailing, leading} zeros intrinsic.

	2461 //

	2462 // We could do constant folding here, but that should have

	2463 // been done by the front-end/middle-end optimizations.

	2464 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,

	2465 Operand *FirstVal, Operand *SecondVal) {

	2466 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).

	2467 // Then the instructions will handle the Val == 0 case much more simply

	2468 // and won't require conversion from bit position to number of zeros.

	2469 //

	2470 // Otherwise:

	2471 // bsr IF_NOT_ZERO, Val

	2472 // mov T_DEST, 63

	2473 // cmovne T_DEST, IF_NOT_ZERO

	2474 // xor T_DEST, 31

	2475 // mov DEST, T_DEST

	2476 //

	2477 // NOTE: T_DEST must be a register because cmov requires its dest to be a

	2478 // register. Also, bsf and bsr require their dest to be a register.

	2479 //

	2480 // The xor DEST, 31 converts a bit position to # of leading zeroes.

	2481 // E.g., for 000... 00001100, bsr will say that the most significant bit

	2482 // set is at position 3, while the number of leading zeros is 28. Xor is

	2483 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).

	2484 //

	2485 // Similar for 64-bit, but start w/ speculating that the upper 32 bits

	2486 // are all zero, and compute the result for that case (checking the lower

	2487 // 32 bits). Then actually compute the result for the upper bits and

	2488 // cmov in the result from the lower computation if the earlier speculation

	2489 // was correct.

	2490 //

	2491 // Cttz is similar, but uses bsf instead, and doesn't require the xor

	2492 // bit position conversion, and the speculation is reversed.

	2493 assert(Ty == IceType_i32 || Ty == IceType_i64);

	2494 Variable *T = makeReg(IceType_i32);

	2495 if (Cttz) {

	2496 _bsf(T, FirstVal);

	2497 } else {

	2498 _bsr(T, FirstVal);

	2499 }

	2500 Variable *T_Dest = makeReg(IceType_i32);

	2501 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);

	2502 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);

	2503 if (Cttz) {

	2504 _mov(T_Dest, ThirtyTwo);

	2505 } else {

	2506 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);

	2507 _mov(T_Dest, SixtyThree);

	2508 }

	2509 _cmov(T_Dest, T, InstX8632::Br_ne);

	2510 if (!Cttz) {

	2511 _xor(T_Dest, ThirtyOne);

	2512 }

	2513 if (Ty == IceType_i32) {

	2514 _mov(Dest, T_Dest);

	2515 return;

	2516 }

	2517 _add(T_Dest, ThirtyTwo);

	2518 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

	2519 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

	2520 // Will be using "test" on this, so we need a registerized variable.

	2521 Variable *SecondVar = legalizeToVar(SecondVal);

	2522 Variable *T_Dest2 = makeReg(IceType_i32);

	2523 if (Cttz) {

	2524 _bsf(T_Dest2, SecondVar);

	2525 } else {

	2526 _bsr(T_Dest2, SecondVar);

	2527 _xor(T_Dest2, ThirtyOne);

	2528 }

	2529 _test(SecondVar, SecondVar);

	2530 _cmov(T_Dest2, T_Dest, InstX8632::Br_e);

	2531 _mov(DestLo, T_Dest2);

	2532 _mov(DestHi, Ctx->getConstantZero(IceType_i32));

	2533 }

	2534

2411 namespace {	2535 namespace {

2412	2536

2413 bool isAdd(const Inst *Inst) {	2537 bool isAdd(const Inst *Inst) {

2414 if (const InstArithmetic *Arith =	2538 if (const InstArithmetic *Arith =

2415 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {	2539 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {

2416 return (Arith->getOp() == InstArithmetic::Add);	2540 return (Arith->getOp() == InstArithmetic::Add);

2417 }	2541 }

2418 return false;	2542 return false;

2419 }	2543 }

2420	2544

(...skipping 654 matching lines...) Loading...
3075 for (SizeT i = 0; i < Size; ++i) {	3199 for (SizeT i = 0; i < Size; ++i) {

3076 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";	3200 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";

3077 }	3201 }

3078 Str << "\t.size\t" << MangledName << ", " << Size << "\n";	3202 Str << "\t.size\t" << MangledName << ", " << Size << "\n";

3079 }	3203 }

3080 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName	3204 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName

3081 << "\n";	3205 << "\n";

3082 }	3206 }

3083	3207

3084 } // end of namespace Ice	3208 } // end of namespace Ice

OLD	NEW

« no previous file with comments | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll » ('j') | no next file with comments »