src/IceTargetLoweringX8632.cpp - Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now.

Side by Side Diff: src/IceTargetLoweringX8632.cpp

Issue 390443005: Lower bitmanip intrinsics, assuming absence of BMI/SSE4.2 for now. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master

Patch Set: stuff Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//	1 //===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//

2 //	2 //

3 // The Subzero Code Generator	3 // The Subzero Code Generator

4 //	4 //

5 // This file is distributed under the University of Illinois Open Source	5 // This file is distributed under the University of Illinois Open Source

6 // License. See LICENSE.TXT for details.	6 // License. See LICENSE.TXT for details.

7 //	7 //

8 //===----------------------------------------------------------------------===//	8 //===----------------------------------------------------------------------===//

9 //	9 //

10 // This file implements the TargetLoweringX8632 class, which	10 // This file implements the TargetLoweringX8632 class, which

(...skipping 21 matching lines...) Expand all Loading...
32 // lowerFcmp() describes the lowering template. In the most general case, there	32 // lowerFcmp() describes the lowering template. In the most general case, there

33 // is a compare followed by two conditional branches, because some fcmp	33 // is a compare followed by two conditional branches, because some fcmp

34 // conditions don't map to a single x86 conditional branch. However, in many	34 // conditions don't map to a single x86 conditional branch. However, in many

35 // cases it is possible to swap the operands in the comparison and have a single	35 // cases it is possible to swap the operands in the comparison and have a single

36 // conditional branch. Since it's quite tedious to validate the table by hand,	36 // conditional branch. Since it's quite tedious to validate the table by hand,

37 // good execution tests are helpful.	37 // good execution tests are helpful.

38	38

39 const struct TableFcmp_ {	39 const struct TableFcmp_ {

40 uint32_t Default;	40 uint32_t Default;

41 bool SwapOperands;	41 bool SwapOperands;

42 InstX8632Br::BrCond C1, C2;	42 InstX8632::BrCond C1, C2;

43 } TableFcmp[] = {	43 } TableFcmp[] = {

44 #define X(val, dflt, swap, C1, C2) \	44 #define X(val, dflt, swap, C1, C2) \

45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \	45 { dflt, swap, InstX8632Br::C1, InstX8632Br::C2 } \

46 ,	46 ,

47 FCMPX8632_TABLE	47 FCMPX8632_TABLE

48 #undef X	48 #undef X

49 };	49 };

50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);	50 const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);

51	51

52 // The following table summarizes the logic for lowering the icmp instruction	52 // The following table summarizes the logic for lowering the icmp instruction

53 // for i32 and narrower types. Each icmp condition has a clear mapping to an	53 // for i32 and narrower types. Each icmp condition has a clear mapping to an

54 // x86 conditional branch instruction.	54 // x86 conditional branch instruction.

55	55

56 const struct TableIcmp32_ {	56 const struct TableIcmp32_ {

57 InstX8632Br::BrCond Mapping;	57 InstX8632::BrCond Mapping;

58 } TableIcmp32[] = {	58 } TableIcmp32[] = {

59 #define X(val, C_32, C1_64, C2_64, C3_64) \	59 #define X(val, C_32, C1_64, C2_64, C3_64) \

60 { InstX8632Br::C_32 } \	60 { InstX8632Br::C_32 } \

61 ,	61 ,

62 ICMPX8632_TABLE	62 ICMPX8632_TABLE

63 #undef X	63 #undef X

64 };	64 };

65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);	65 const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);

66	66

67 // The following table summarizes the logic for lowering the icmp instruction	67 // The following table summarizes the logic for lowering the icmp instruction

68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and	68 // for the i64 type. For Eq and Ne, two separate 32-bit comparisons and

69 // conditional branches are needed. For the other conditions, three separate	69 // conditional branches are needed. For the other conditions, three separate

70 // conditional branches are needed.	70 // conditional branches are needed.

71 const struct TableIcmp64_ {	71 const struct TableIcmp64_ {

72 InstX8632Br::BrCond C1, C2, C3;	72 InstX8632::BrCond C1, C2, C3;

73 } TableIcmp64[] = {	73 } TableIcmp64[] = {

74 #define X(val, C_32, C1_64, C2_64, C3_64) \	74 #define X(val, C_32, C1_64, C2_64, C3_64) \

75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \	75 { InstX8632Br::C1_64, InstX8632Br::C2_64, InstX8632Br::C3_64 } \

76 ,	76 ,

77 ICMPX8632_TABLE	77 ICMPX8632_TABLE

78 #undef X	78 #undef X

79 };	79 };

80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);	80 const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);

81	81

82 InstX8632Br::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {	82 InstX8632::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {

83 size_t Index = static_cast<size_t>(Cond);	83 size_t Index = static_cast<size_t>(Cond);

84 assert(Index < TableIcmp32Size);	84 assert(Index < TableIcmp32Size);

85 return TableIcmp32[Index].Mapping;	85 return TableIcmp32[Index].Mapping;

86 }	86 }

87	87

88 // The maximum number of arguments to pass in XMM registers	88 // The maximum number of arguments to pass in XMM registers

89 const unsigned X86_MAX_XMM_ARGS = 4;	89 const unsigned X86_MAX_XMM_ARGS = 4;

90	90

91 // In some cases, there are x-macros tables for both high-level and	91 // In some cases, there are x-macros tables for both high-level and

92 // low-level instructions/operands that use the same enum key value.	92 // low-level instructions/operands that use the same enum key value.

(...skipping 2008 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2101 _storeq(T, Addr);	2101 _storeq(T, Addr);

2102 _mfence();	2102 _mfence();

2103 return;	2103 return;

2104 }	2104 }

2105 InstStore *Store = InstStore::create(Func, Value, Ptr);	2105 InstStore *Store = InstStore::create(Func, Value, Ptr);

2106 lowerStore(Store);	2106 lowerStore(Store);

2107 _mfence();	2107 _mfence();

2108 return;	2108 return;

2109 }	2109 }

2110 case Intrinsics::Bswap:	2110 case Intrinsics::Bswap:

	2111 Func->setError("Unhandled intrinsic");

	2112 return;

	2113 case Intrinsics::Ctpop: {

	2114 Variable *Dest = Instr->getDest();

	2115 Operand *Val = Instr->getArg(0);

	2116 InstCall *Call = makeHelperCall(Val->getType() == IceType_i32 ?

	2117 "__popcountsi2" : "__popcountdi2", Dest, 1);

	2118 Call->addArg(Val);

	2119 lowerCall(Call);

	2120 // The popcount helpers always return 32-bit values, while the intrinsic's

	2121 // signature matches the native POPCNT instruction and fills a 64-bit reg

	2122 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case

	2123 // the user doesn't do that in the IR. If the user does that in the IR,

	2124 // then this zero'ing instruction is dead and gets optimized out.

	2125 if (Val->getType() == IceType_i64) {

	2126 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

	2127 Constant *Zero = Ctx->getConstantZero(IceType_i32);

	2128 _mov(DestHi, Zero);

	2129 }

	2130 return;

	2131 }

2111 case Intrinsics::Ctlz:	2132 case Intrinsics::Ctlz:

2112 case Intrinsics::Ctpop:	2133 // The "is zero undef" parameter is ignored and we always return

	2134 // a well-defined value.

	2135 lowerCtlz(Instr->getDest(), Instr->getArg(0));

	2136 return;

2113 case Intrinsics::Cttz:	2137 case Intrinsics::Cttz:

2114 // TODO(jvoung): fill it in.	2138 // The "is zero undef" parameter is ignored and we always return

2115 Func->setError("Unhandled intrinsic");	2139 // a well-defined value.

	2140 lowerCttz(Instr->getDest(), Instr->getArg(0));

2116 return;	2141 return;

2117 case Intrinsics::Longjmp: {	2142 case Intrinsics::Longjmp: {

2118 InstCall *Call = makeHelperCall("longjmp", NULL, 2);	2143 InstCall *Call = makeHelperCall("longjmp", NULL, 2);

2119 Call->addArg(Instr->getArg(0));	2144 Call->addArg(Instr->getArg(0));

2120 Call->addArg(Instr->getArg(1));	2145 Call->addArg(Instr->getArg(1));

2121 lowerCall(Call);	2146 lowerCall(Call);

2122 return;	2147 return;

2123 }	2148 }

2124 case Intrinsics::Memcpy: {	2149 case Intrinsics::Memcpy: {

2125 // In the future, we could potentially emit an inline memcpy/memset, etc.	2150 // In the future, we could potentially emit an inline memcpy/memset, etc.

(...skipping 274 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2400 // If Val is a variable, model the extended live range of Val through	2425 // If Val is a variable, model the extended live range of Val through

2401 // the end of the loop, since it will be re-used by the loop.	2426 // the end of the loop, since it will be re-used by the loop.

2402 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {	2427 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {

2403 Context.insert(InstFakeUse::create(Func, ValVar));	2428 Context.insert(InstFakeUse::create(Func, ValVar));

2404 }	2429 }

2405 // The address base is also reused in the loop.	2430 // The address base is also reused in the loop.

2406 Context.insert(InstFakeUse::create(Func, Addr->getBase()));	2431 Context.insert(InstFakeUse::create(Func, Addr->getBase()));

2407 _mov(Dest, T_eax);	2432 _mov(Dest, T_eax);

2408 }	2433 }

2409	2434

	2435 // We could do constant folding here, but that should have

	2436 // been done by the front-end/middle-end optimizations.

	2437 void TargetX8632::lowerCtlz(Variable *Dest, Operand *Val) {

	2438 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).

	2439 // Then the instructions will handle the Val == 0 case much more simply

	2440 // and won't require conversion from bit position to number of zeros.

	2441 //

	2442 // Otherwise:

	2443 // bsr IF_NOT_ZERO, Val

	2444 // mov DEST, 63

	2445 // cmovne DEST, IF_NOT_ZERO
	Jim Stichnoth 2014/07/14 23:20:45 Might want to document that the cmov instruction requires Dest to be a physical register, hence the use of T_Dest. I'm not sure, but this could be the first instruction in Subzero where that is true and a specific register is not required.
	jvoung (off chromium) 2014/07/15 21:30:23 On 2014/07/14 23:20:45, stichnot wrote: > Might want to document that the cmov instruction requires Dest to be a physical > register, hence the use of T_Dest. I'm not sure, but this could be the first > instruction in Subzero where that is true and a specific register is not > required. Done. I think the cvt instructions and some/most of the xmm ones also require dest to be a register? Actually, looks like bsf and bsr also require the dest to be a register.
	2446 // xor DEST, 31

	2447 //

	2448 // The xor DEST, 31 converts a bit position to # of leading zeroes.

	2449 // E.g., for 000... 00001100, bsr will say that the most significant bit

	2450 // set is at position 3, while the number of leading zeros is 28. Xor is

	2451 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).

	2452 //

	2453 // Similar for 64-bit, but start w/ speculating that the upper 32 bits

	2454 // are all zero, and compute the result for that case (checking the lower

	2455 // 32 bits). Then actually compute the result for the upper bits and

	2456 // cmov in the result from the lower computation if the earlier speculation

	2457 // was correct.

	2458 Type Ty = Val->getType();

	2459 assert(Ty == IceType_i32 || Ty == IceType_i64);

	2460 Val = legalize(Val);

	2461 Operand *FirstVal = Ty == IceType_i32 ? Val : loOperand(Val);

	2462 Variable *T = makeReg(IceType_i32);

	2463 _bsr(T, FirstVal);

	2464 Variable *T_Dest = makeReg(IceType_i32);

	2465 Constant *SixtyThree = Ctx->getConstantInt(IceType_i32, 63);

	2466 Constant *ThirtyOne = Ctx->getConstantInt(IceType_i32, 31);

	2467 _mov(T_Dest, SixtyThree);

	2468 _cmov(T_Dest, T, InstX8632::Br_ne);

	2469 _xor(T_Dest, ThirtyOne);

	2470 if (Ty == IceType_i32) {

	2471 _mov(Dest, T_Dest);

	2472 return;

	2473 }

	2474 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);

	2475 _add(T_Dest, ThirtyTwo);

	2476 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

	2477 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

	2478 // Will be using "test" on this, so we need a registerized version.

	2479 Variable *HigherVar = legalizeToVar(hiOperand(Val));

	2480 Variable *T_Dest2 = makeReg(IceType_i32);

	2481 _bsr(T_Dest2, HigherVar);

	2482 _xor(T_Dest2, ThirtyOne);

	2483 _test(HigherVar, HigherVar);

	2484 _cmov(T_Dest2, T_Dest, InstX8632::Br_e);

	2485 _mov(DestLo, T_Dest2);

	2486 _mov(DestHi, Ctx->getConstantZero(IceType_i32));

	2487 }

	2488

	2489 void TargetX8632::lowerCttz(Variable *Dest, Operand *Val) {

	2490 // TODO(jvoung): Determine if the user CPU supports TZCNT (BMI).
	Jim Stichnoth 2014/07/14 23:20:45 Would it make sense to refactor this and lowerCtlz()? It looks like there's a lot in common.
	jvoung (off chromium) 2014/07/15 21:30:23 On 2014/07/14 23:20:45, stichnot wrote: > Would it make sense to refactor this and lowerCtlz()? It looks like there's a > lot in common. Hmm, I tried that in the new patch set. There are still a bunch of if-else because of the xor vs no xor, and the different constants. Otherwise, I might be able to remove one more if-else by passing in the _bsf/_bsr method pointer.
	2491 // Then the instructions will handle the Val == 0 case much more simply.

	2492 //

	2493 // Otherwise:

	2494 // bsf IF_NOT_ZERO, Val

	2495 // mov IF_ZERO, 32

	2496 // cmovne IF_ZERO, IF_NOT_ZERO

	2497 //

	2498 // Similar for 64-bit, but start w/ speculating that the bottom 32 bits

	2499 // are all zero.

	2500 Type Ty = Val->getType();

	2501 assert(Ty == IceType_i32 || Ty == IceType_i64);

	2502 Val = legalize(Val);

	2503 Operand *FirstVal = Ty == IceType_i32 ? Val : hiOperand(Val);

	2504 Variable *T = makeReg(IceType_i32);

	2505 _bsf(T, FirstVal);

	2506 Variable *T_Dest = makeReg(IceType_i32);

	2507 Constant *ThirtyTwo = Ctx->getConstantInt(IceType_i32, 32);

	2508 _mov(T_Dest, ThirtyTwo);

	2509 _cmov(T_Dest, T, InstX8632::Br_ne);

	2510 if (Ty == IceType_i32) {

	2511 _mov(Dest, T_Dest);

	2512 return;

	2513 }

	2514 _add(T_Dest, ThirtyTwo);

	2515 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));

	2516 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));

	2517 // Will be using "test" on this, so we need a registerized version.

	2518 Variable *LowerVar = legalizeToVar(loOperand(Val));

	2519 Variable *T_Dest2 = makeReg(IceType_i32);

	2520 _bsf(T_Dest2, LowerVar);

	2521 _test(LowerVar, LowerVar);

	2522 _cmov(T_Dest2, T_Dest, InstX8632::Br_e);

	2523 _mov(DestLo, T_Dest2);

	2524 _mov(DestHi, Ctx->getConstantZero(IceType_i32));

	2525 }

	2526

2410 namespace {	2527 namespace {

2411	2528

2412 bool isAdd(const Inst *Inst) {	2529 bool isAdd(const Inst *Inst) {

2413 if (const InstArithmetic *Arith =	2530 if (const InstArithmetic *Arith =

2414 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {	2531 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {

2415 return (Arith->getOp() == InstArithmetic::Add);	2532 return (Arith->getOp() == InstArithmetic::Add);

2416 }	2533 }

2417 return false;	2534 return false;

2418 }	2535 }

2419	2536

(...skipping 649 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3069 for (SizeT i = 0; i < Size; ++i) {	3186 for (SizeT i = 0; i < Size; ++i) {

3070 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";	3187 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";

3071 }	3188 }

3072 Str << "\t.size\t" << MangledName << ", " << Size << "\n";	3189 Str << "\t.size\t" << MangledName << ", " << Size << "\n";

3073 }	3190 }

3074 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName	3191 Str << "\t" << (IsInternal ? ".local" : ".global") << "\t" << MangledName

3075 << "\n";	3192 << "\n";

3076 }	3193 }

3077	3194

3078 } // end of namespace Ice	3195 } // end of namespace Ice

OLD	NEW

« src/IceInstX8632.cpp ('K') | « src/IceTargetLoweringX8632.h ('k') | tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll » ('j') | tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll » ('J')