Chromium Code Reviews

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. (Closed)
Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: "try to merge the two" (Created 6 years, 5 months ago)
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===// 1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 // 9 //
10 // This file implements the TargetLoweringX8632 class, which 10 // This file implements the TargetLoweringX8632 class, which
(...skipping 21 matching lines...)
32 // lowerFcmp() describes the lowering template. In the most general case, there 32 // lowerFcmp() describes the lowering template. In the most general case, there
33 // is a compare followed by two conditional branches, because some fcmp 33 // is a compare followed by two conditional branches, because some fcmp
34 // conditions don't map to a single x86 conditional branch. However, in many 34 // conditions don't map to a single x86 conditional branch. However, in many
35 // cases it is possible to swap the operands in the comparison and have a single 35 // cases it is possible to swap the operands in the comparison and have a single
36 // conditional branch. Since it's quite tedious to validate the table by hand, 36 // conditional branch. Since it's quite tedious to validate the table by hand,
37 // good execution tests are helpful. 37 // good execution tests are helpful.
38 38
39 const struct TableFcmp_ { 39 const struct TableFcmp_ {
40 uint32_t Default; 40 uint32_t Default;
41 bool SwapOperands; 41 bool SwapOperands;
42 InstX8632Br::BrCond C1, C2; 42 InstX8632::BrCond C1, C2;
43 } TableFcmp[] = { 43 } TableFcmp[] = {
44 #define X(val, dflt, swap, C1, C2) \ 44 #define X(val, dflt, swap, C1, C2) \
45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \ 45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \
46 , 46 ,
47 FCMPX8632_TABLE 47 FCMPX8632_TABLE
48 #undef X 48 #undef X
49 }; 49 };
50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp); 50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
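As a hedged illustration of the comment above (not part of the patch; fcmpOeqModel is a hypothetical name), the following C++ model shows why some fcmp conditions need two conditional branches: after ucomiss, the unordered (NaN) case sets ZF together with PF, so a lone je would wrongly accept NaN == NaN, and ordered-equal is therefore lowered with a branch on the parity flag followed by a branch on the zero flag.

// Hedged sketch, not from the Subzero sources.
bool fcmpOeqModel(float A, float B) {
  bool Unordered = (A != A) || (B != B); // models the parity flag set by ucomiss
  if (Unordered)
    return false;                        // first conditional branch (jp)
  return A == B;                         // second conditional branch (je)
}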
51 51
52 // The following table summarizes the logic for lowering the icmp instruction 52 // The following table summarizes the logic for lowering the icmp instruction
53 // for i32 and narrower types. Each icmp condition has a clear mapping to an 53 // for i32 and narrower types. Each icmp condition has a clear mapping to an
54 // x86 conditional branch instruction. 54 // x86 conditional branch instruction.
55 55
56 const struct TableIcmp32_ { 56 const struct TableIcmp32_ {
57 InstX8632Br::BrCond Mapping; 57 InstX8632::BrCond Mapping;
58 } TableIcmp32[] = { 58 } TableIcmp32[] = {
59 #define X(val, C_32, C1_64, C2_64, C3_64) \ 59 #define X(val, C_32, C1_64, C2_64, C3_64) \
60 { InstX8632Br::C_32 } \ 60 { InstX8632Br::C_32 } \
61 , 61 ,
62 ICMPX8632_TABLE 62 ICMPX8632_TABLE
63 #undef X 63 #undef X
64 }; 64 };
65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32); 65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
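The #define X ... #undef X idiom that builds TableFcmp, TableIcmp32, and TableIcmp64 is the standard X-macro pattern: one master list is expanded once per consumer, so the enum keys and the lowering tables cannot drift apart. A minimal standalone sketch with hypothetical names:

// Hedged sketch of the X-macro pattern; names are made up for illustration.
#define DEMO_ICMP_TABLE \
  X(Eq, "je")           \
  X(Ne, "jne")          \
  X(Ult, "jb")

enum DemoCond {
#define X(val, br) Demo_##val,
  DEMO_ICMP_TABLE
#undef X
};

const char *DemoBranch[] = {
#define X(val, br) br,
  DEMO_ICMP_TABLE
#undef X
};
// DemoBranch[Demo_Ult] == "jb": one list, two consumers, kept in sync.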
66 66
67 // The following table summarizes the logic for lowering the icmp instruction 67 // The following table summarizes the logic for lowering the icmp instruction
68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and 68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and
69 // conditional branches are needed. For the other conditions, three separate 69 // conditional branches are needed. For the other conditions, three separate
70 // conditional branches are needed. 70 // conditional branches are needed.
71 const struct TableIcmp64_ { 71 const struct TableIcmp64_ {
72 InstX8632Br::BrCond C1, C2, C3; 72 InstX8632::BrCond C1, C2, C3;
73 } TableIcmp64[] = { 73 } TableIcmp64[] = {
74 #define X(val, C_32, C1_64, C2_64, C3_64) \ 74 #define X(val, C_32, C1_64, C2_64, C3_64) \
75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \ 75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \
76 , 76 ,
77 ICMPX8632_TABLE 77 ICMPX8632_TABLE
78 #undef X 78 #undef X
79 }; 79 };
80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64); 80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
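To make the three-branch scheme concrete, here is a hedged C++ model (icmpUlt64Model is a hypothetical name, not from the patch) of deciding an unsigned 64-bit less-than using only 32-bit compares: two branches on the high halves, and a third on the low halves when the high halves are equal.

#include <cstdint>

// Hedged sketch mirroring the C1/C2/C3 branches described above.
bool icmpUlt64Model(uint32_t AHi, uint32_t ALo, uint32_t BHi, uint32_t BLo) {
  if (AHi < BHi)
    return true;    // C1: jb on the high halves
  if (AHi > BHi)
    return false;   // C2: ja on the high halves
  return ALo < BLo; // C3: jb on the low halves
}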
81 81
82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) { 82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
83 size_t Index = static_cast<size_t>(Cond); 83 size_t Index = static_cast<size_t>(Cond);
84 assert(Index < TableIcmp32Size); 84 assert(Index < TableIcmp32Size);
85 return TableIcmp32[Index].Mapping; 85 return TableIcmp32[Index].Mapping;
86 } 86 }
87 87
88 // The maximum number of arguments to pass in XMM registers 88 // The maximum number of arguments to pass in XMM registers
89 const unsigned X86_MAX_XMM_ARGS = 4; 89 const unsigned X86_MAX_XMM_ARGS = 4;
90 90
91 // In some cases, there are x-macros tables for both high-level and 91 // In some cases, there are x-macros tables for both high-level and
92 // low-level instructions/operands that use the same enum key value. 92 // low-level instructions/operands that use the same enum key value.
(...skipping 2009 matching lines...)
2102 _storeq(T, Addr); 2102 _storeq(T, Addr);
2103 _mfence(); 2103 _mfence();
2104 return; 2104 return;
2105 } 2105 }
2106 InstStore *Store = InstStore::create(Func, Value, Ptr); 2106 InstStore *Store = InstStore::create(Func, Value, Ptr);
2107 lowerStore(Store); 2107 lowerStore(Store);
2108 _mfence(); 2108 _mfence();
2109 return; 2109 return;
2110 } 2110 }
2111 case Intrinsics::Bswap: 2111 case Intrinsics::Bswap:
2112 case Intrinsics::Ctlz:
2113 case Intrinsics::Ctpop:
2114 case Intrinsics::Cttz:
2115 // TODO(jvoung): fill it in.
2116 Func->setError("Unhandled intrinsic"); 2112 Func->setError("Unhandled intrinsic");
2117 return; 2113 return;
2114 case Intrinsics::Ctpop: {
2115 Variable *Dest = Instr->getDest();
2116 Operand *Val = Instr->getArg(0);
2117 InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ?
Jim Stichnoth 2014/07/15 22:08:09 Here and below, I'd prefer to make type comparison
jvoung (off chromium) 2014/07/15 23:10:33 Done. Yep, would be good to identify 64-bit speci
2118 "__popcountsi2" : "__popcountdi2", Dest, 1);
2119 Call->addArg(Val);
2120 lowerCall(Call);
2121 // The popcount helpers always return 32-bit values, while the intrinsic's
2122 // signature matches the native POPCNT instruction and fills a 64-bit reg
2123 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
2124 // the user doesn't do that in the IR. If the user does that in the IR,
2125 // then this zeroing instruction is dead and gets optimized out.
2126 if (Val->getType() == IceType_i64) {
2127 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2128 Constant *Zero = Ctx->getConstantZero(IceType_i32);
2129 _mov(DestHi, Zero);
2130 }
2131 return;
2132 }
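The upper-half clearing in the Ctpop case can be modeled in plain C++ as follows (a hedged sketch; __builtin_popcountll merely stands in for the __popcountdi2 helper, whose result is a 32-bit value):

#include <cstdint>

// Hedged sketch of the lowered 64-bit ctpop: the helper returns an i32,
// so the high 32 bits of the i64 destination are cleared explicitly.
uint64_t ctpop64ViaHelperModel(uint64_t Val) {
  uint32_t DestLo = (uint32_t)__builtin_popcountll(Val); // the helper call
  uint32_t DestHi = 0;                                   // mov DestHi, 0
  return ((uint64_t)DestHi << 32) | DestLo;
}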
2133 case Intrinsics::Ctlz: {
2134 // The "is zero undef" parameter is ignored and we always return
2135 // a well-defined value.
2136 const bool IsCttz = false;
2137 Operand *Val = legalize(Instr->getArg(0));
2138 Operand *FirstVal;
2139 Operand *SecondVal = NULL;
2140 if (Val->getType() == IceType_i32) {
2141 FirstVal = Val;
2142 } else {
2143 FirstVal = loOperand(Val);
2144 SecondVal = hiOperand(Val);
2145 }
2146 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal);
Jim Stichnoth 2014/07/15 22:08:09 80-col
jvoung (off chromium) 2014/07/15 23:10:33 Done.
2147 return;
2148 }
2149 case Intrinsics::Cttz: {
2150 // The "is zero undef" parameter is ignored and we always return
2151 // a well-defined value.
2152 const bool IsCttz = true;
2153 Operand *Val = legalize(Instr->getArg(0));
2154 Operand *FirstVal;
2155 Operand *SecondVal = NULL;
2156 if (Val->getType() == IceType_i32) {
2157 FirstVal = Val;
2158 } else {
2159 FirstVal = hiOperand(Val);
2160 SecondVal = loOperand(Val);
2161 }
2162 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal, SecondVal);
Jim Stichnoth 2014/07/15 22:08:09 80-col
jvoung (off chromium) 2014/07/15 23:10:33 Done.
2163 return;
2164 }
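Note that the Cttz case feeds lowerCountZeros with the halves swapped relative to Ctlz, because the speculation direction is reversed. A hedged C++ model of that reversed speculation (hypothetical names, not part of the patch):

#include <cstdint>

// bsf-like helper: index of the least significant set bit; 32 when Val == 0.
static uint32_t cttz32Model(uint32_t Val) {
  for (uint32_t I = 0; I < 32; ++I)
    if (Val & (1u << I))
      return I;
  return 32;
}

uint64_t cttz64Model(uint64_t Val) {
  uint32_t Lo = (uint32_t)Val, Hi = (uint32_t)(Val >> 32);
  uint32_t Speculated = cttz32Model(Hi) + 32; // assume the low half is zero
  uint32_t FromLo = cttz32Model(Lo);          // result if the low half is set
  return Lo == 0 ? Speculated : FromLo;       // test Lo; cmove
}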
2118 case Intrinsics::Longjmp: { 2165 case Intrinsics::Longjmp: {
2119 InstCall *Call = makeHelperCall("longjmp", NULL, 2); 2166 InstCall *Call = makeHelperCall("longjmp", NULL, 2);
2120 Call->addArg(Instr->getArg(0)); 2167 Call->addArg(Instr->getArg(0));
2121 Call->addArg(Instr->getArg(1)); 2168 Call->addArg(Instr->getArg(1));
2122 lowerCall(Call); 2169 lowerCall(Call);
2123 return; 2170 return;
2124 } 2171 }
2125 case Intrinsics::Memcpy: { 2172 case Intrinsics::Memcpy: {
2126 // In the future, we could potentially emit an inline memcpy/memset, etc. 2173 // In the future, we could potentially emit an inline memcpy/memset, etc.
2127 // for intrinsic calls w/ a known length. 2174 // for intrinsic calls w/ a known length.
(...skipping 273 matching lines...)
2401 // If Val is a variable, model the extended live range of Val through 2448 // If Val is a variable, model the extended live range of Val through
2402 // the end of the loop, since it will be re-used by the loop. 2449 // the end of the loop, since it will be re-used by the loop.
2403 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) { 2450 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
2404 Context.insert(InstFakeUse::create(Func, ValVar)); 2451 Context.insert(InstFakeUse::create(Func, ValVar));
2405 } 2452 }
2406 // The address base is also reused in the loop. 2453 // The address base is also reused in the loop.
2407 Context.insert(InstFakeUse::create(Func, Addr->getBase())); 2454 Context.insert(InstFakeUse::create(Func, Addr->getBase()));
2408 _mov(Dest, T_eax); 2455 _mov(Dest, T_eax);
2409 } 2456 }
2410 2457
2458 // Lowers count {trailing, leading} zeros intrinsic.
2459 //
2460 // We could do constant folding here, but that should have
2461 // been done by the front-end/middle-end optimizations.
2462 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
2463 Operand *FirstVal, Operand *SecondVal) {
2464 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
2465 // Then the instructions will handle the Val == 0 case much more simply
2466 // and won't require conversion from bit position to number of zeros.
2467 //
2468 // Otherwise:
2469 // bsr IF_NOT_ZERO, Val
2470 // mov T_DEST, 63
2471 // cmovne T_DEST, IF_NOT_ZERO
2472 // xor T_DEST, 31
2473 // mov DEST, T_DEST
2474 //
2475 // NOTE: T_DEST must be a register because cmov requires its dest to be a
2476 // register. Also, bsf and bsr require their dest to be a register.
2477 //
2478 // The xor DEST, 31 converts a bit position to # of leading zeroes.
2479 // E.g., for 000... 00001100, bsr will say that the most significant bit
2480 // set is at position 3, while the number of leading zeros is 28. Xor is
2481 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
2482 //
2483 // Similar for 64-bit, but start w/ speculating that the upper 32 bits
2484 // are all zero, and compute the result for that case (checking the lower
2485 // 32 bits). Then actually compute the result for the upper bits and
2486 // cmov in the result from the lower computation if the earlier speculation
2487 // was correct.
2488 //
2489 // Cttz is similar, but uses bsf instead, and doesn't require the xor
2490 // bit position conversion, and the speculation is reversed.
2491 assert(Ty == IceType_i32 || Ty == IceType_i64);
2492 Variable *T = makeReg(IceType_i32);
2493 if (Cttz) {
2494 _bsf(T, FirstVal);
2495 } else {
2496 _bsr(T, FirstVal);
2497 }
2498 Variable *T_Dest = makeReg(IceType_i32);
2499 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);
2500 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);
2501 if (Cttz) {
2502 _mov(T_Dest, ThirtyTwo);
2503 } else {
2504 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);
2505 _mov(T_Dest, SixtyThree);
2506 }
2507 _cmov(T_Dest, T, InstX8632::Br_ne);
2508 if (!Cttz) {
2509 _xor(T_Dest, ThirtyOne);
2510 }
2511 if (Ty == IceType_i32) {
2512 _mov(Dest, T_Dest);
2513 return;
2514 }
2515 _add(T_Dest, ThirtyTwo);
2516 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
2517 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2518 // Will be using "test" on this, so we need a registerized variable.
2519 Variable *SecondVar = legalizeToVar(SecondVal);
2520 Variable *T_Dest2 = makeReg(IceType_i32);
2521 if (Cttz) {
2522 _bsf(T_Dest2, SecondVar);
2523 } else {
2524 _bsr(T_Dest2, SecondVar);
2525 _xor(T_Dest2, ThirtyOne);
2526 }
2527 _test(SecondVar, SecondVar);
2528 _cmov(T_Dest2, T_Dest, InstX8632::Br_e);
2529 _mov(DestLo, T_Dest2);
2530 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
2531 }
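The long comment at the top of lowerCountZeros can be checked against a small, purely illustrative C++ model of the no-LZCNT ctlz lowering (hypothetical names, not part of the patch): bsr, a 63 fallback, cmovne, xor 31, and, for 64-bit, speculation that the high half is zero with a cmov to pick the high-half result otherwise.

#include <cstdint>

// bsr-like helper: index of the most significant set bit; callers handle Val == 0.
static uint32_t bsrModel(uint32_t Val) {
  uint32_t Index = 0;
  for (int I = 31; I >= 0; --I)
    if (Val & (1u << (uint32_t)I)) {
      Index = (uint32_t)I;
      break;
    }
  return Index;
}

uint32_t ctlz32Model(uint32_t Val) {
  uint32_t TDest = 63;     // mov T_DEST, 63
  if (Val != 0)
    TDest = bsrModel(Val); // cmovne T_DEST, IF_NOT_ZERO
  return TDest ^ 31;       // 31 - N for N <= 31; maps 63 to 32 for Val == 0
}

uint32_t ctlz64Model(uint64_t Val) {
  uint32_t Lo = (uint32_t)Val, Hi = (uint32_t)(Val >> 32);
  uint32_t Speculated = ctlz32Model(Lo) + 32; // assume the high half is zero
  uint32_t FromHi = ctlz32Model(Hi);          // result if the high half is set
  return Hi == 0 ? Speculated : FromHi;       // test Hi; cmove
}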
2532
2411 namespace { 2533 namespace {
2412 2534
2413 bool isAdd(const Inst *Inst) { 2535 bool isAdd(const Inst *Inst) {
2414 if (const InstArithmetic *Arith = 2536 if (const InstArithmetic *Arith =
2415 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) { 2537 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
2416 return (Arith->getOp() == InstArithmetic::Add); 2538 return (Arith->getOp() == InstArithmetic::Add);
2417 } 2539 }
2418 return false; 2540 return false;
2419 } 2541 }
2420 2542
(...skipping 654 matching lines...)
3075 for (SizeT i = 0; i < Size; ++i) { 3197 for (SizeT i = 0; i < Size; ++i) {
3076 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n"; 3198 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
3077 } 3199 }
3078 Str << "\t.size\t" << MangledName << ", " << Size << "\n"; 3200 Str << "\t.size\t" << MangledName << ", " << Size << "\n";
3079 } 3201 }
3080 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName 3202 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName
3081 << "\n"; 3203 << "\n";
3082 } 3204 }
3083 3205
3084 } // end of namespace Ice 3206 } // end of namespace Ice