| OLD | NEW |
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3012 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3023 return; | 3023 return; |
| 3024 } | 3024 } |
| 3025 case Intrinsics::Longjmp: { | 3025 case Intrinsics::Longjmp: { |
| 3026 InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2); | 3026 InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2); |
| 3027 Call->addArg(Instr->getArg(0)); | 3027 Call->addArg(Instr->getArg(0)); |
| 3028 Call->addArg(Instr->getArg(1)); | 3028 Call->addArg(Instr->getArg(1)); |
| 3029 lowerCall(Call); | 3029 lowerCall(Call); |
| 3030 return; | 3030 return; |
| 3031 } | 3031 } |
| 3032 case Intrinsics::Memcpy: { | 3032 case Intrinsics::Memcpy: { |
| 3033 // In the future, we could potentially emit an inline memcpy/memset, etc. | 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3034 // for intrinsic calls w/ a known length. | |
| 3035 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | |
| 3036 Call->addArg(Instr->getArg(0)); | |
| 3037 Call->addArg(Instr->getArg(1)); | |
| 3038 Call->addArg(Instr->getArg(2)); | |
| 3039 lowerCall(Call); | |
| 3040 return; | 3034 return; |
| 3041 } | 3035 } |
| 3042 case Intrinsics::Memmove: { | 3036 case Intrinsics::Memmove: { |
| 3043 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3037 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); |
| 3044 Call->addArg(Instr->getArg(0)); | 3038 Call->addArg(Instr->getArg(0)); |
| 3045 Call->addArg(Instr->getArg(1)); | 3039 Call->addArg(Instr->getArg(1)); |
| 3046 Call->addArg(Instr->getArg(2)); | 3040 Call->addArg(Instr->getArg(2)); |
| 3047 lowerCall(Call); | 3041 lowerCall(Call); |
| 3048 return; | 3042 return; |
| 3049 } | 3043 } |
| (...skipping 430 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3480 _bsr(T_Dest2, SecondVar); | 3474 _bsr(T_Dest2, SecondVar); |
| 3481 _xor(T_Dest2, ThirtyOne); | 3475 _xor(T_Dest2, ThirtyOne); |
| 3482 } | 3476 } |
| 3483 _test(SecondVar, SecondVar); | 3477 _test(SecondVar, SecondVar); |
| 3484 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3478 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3485 _mov(DestLo, T_Dest2); | 3479 _mov(DestLo, T_Dest2); |
| 3486 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3480 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3487 } | 3481 } |
| 3488 | 3482 |
template <class Machine>
/// Lowers a memcpy intrinsic. When the byte count is a compile-time constant
/// small enough to unroll (<= BytesPerStorep * UNROLL_LIMIT), emits an inline
/// sequence of the widest loads/stores that fit; otherwise falls back on a
/// call to the memcpy helper.
///
/// \param Dest  destination address operand.
/// \param Src   source address operand.
/// \param Count byte count operand; only inlined if it is a ConstantInteger32.
void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
                                         Operand *Count) {
  // There is a load and store for each chunk in the unroll
  constexpr uint32_t UNROLL_LIMIT = 8;
  // Store widths, in bytes, for each instruction form used below:
  // storep = 16-byte vector, storeq = 8-byte, then 4/2/1-byte scalar stores.
  constexpr uint32_t BytesPerStorep = 16;
  constexpr uint32_t BytesPerStoreq = 8;
  constexpr uint32_t BytesPerStorei32 = 4;
  constexpr uint32_t BytesPerStorei16 = 2;
  constexpr uint32_t BytesPerStorei8 = 1;

  // Check if the operands are constants
  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
  const bool IsCountConst = CountConst != nullptr;
  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;

  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
    // Unlikely, but nothing to do if it does happen
    if (CountValue == 0)
      return;

    Variable *SrcBase = legalizeToReg(Src);
    Variable *DestBase = legalizeToReg(Dest);

    // Emits one load+store pair of type Ty at byte offset OffsetAmt from the
    // two base registers. Vector chunks use movp/storep, 8-byte chunks go
    // through an f64 register via movq/storeq, everything else is a scalar
    // mov/store.
    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
      // A zero offset is represented as a null Offset rather than a constant 0.
      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
      // TODO(ascull): this or add nullptr test to _movp, _movq
      Variable *Data = makeReg(Ty);

      // TODO(ascull): is 64-bit better with vector or scalar movq?
      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
      if (isVectorType(Ty))
        _movp(Data, SrcMem);
      else if (Ty == IceType_f64)
        _movq(Data, SrcMem);
      else
        _mov(Data, SrcMem);

      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
      if (isVectorType(Ty))
        _storep(Data, DestMem);
      else if (Ty == IceType_f64)
        _storeq(Data, DestMem);
      else
        _store(Data, DestMem);
    };

    // Lowers the assignment to the remaining bytes. Assumes the original size
    // was large enough to allow for overlaps.
    // Instead of chaining progressively narrower copies, a single copy of the
    // next-larger width is placed so it ends exactly at CountValue; it may
    // overlap bytes already copied, which is harmless since source and
    // destination data for those bytes are identical.
    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
      if (Size > BytesPerStoreq) {
        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
      } else if (Size > BytesPerStorei32) {
        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
      } else if (Size > BytesPerStorei16) {
        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
      } else if (Size > BytesPerStorei8) {
        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
      } else if (Size == BytesPerStorei8) {
        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
      }
    };

    if (CountValue >= BytesPerStorep) {
      // Use large vector operations
      // N starts at CountValue rounded down to a multiple of 16; chunks are
      // emitted from the highest offset down to offset 0.
      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
        N -= BytesPerStorep;
        lowerCopy(IceType_v16i8, N);
      }
      lowerLeftOvers(CountValue & 0xF);
      return;
    }

    // Too small to use large vector operations so use small ones instead
    if (CountValue >= BytesPerStoreq) {
      lowerCopy(IceType_f64, 0);
      lowerLeftOvers(CountValue - BytesPerStoreq);
      return;
    }

    // Too small for vector operations so use scalar ones
    if (CountValue >= BytesPerStorei32) {
      lowerCopy(IceType_i32, 0);
      lowerLeftOvers(CountValue - BytesPerStorei32);
      return;
    }

    // 3 is the awkward size as it is too small for the vector or 32-bit
    // operations and will not work with lowerLeftOvers as there is no valid
    // overlap.
    if (CountValue == 3) {
      lowerCopy(IceType_i16, 0);
      lowerCopy(IceType_i8, 2);
      return;
    }

    // 1 or 2 can be done in a single scalar copy
    lowerLeftOvers(CountValue);
    return;
  }

  // Fall back on a function call
  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
  Call->addArg(Dest);
  Call->addArg(Src);
  Call->addArg(Count);
  lowerCall(Call);
}
| 3591 |
| 3592 template <class Machine> |
| 3490 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3593 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3491 Operand *Count) { | 3594 Operand *Count) { |
| 3492 constexpr uint32_t UNROLL_LIMIT = 16; | 3595 constexpr uint32_t UNROLL_LIMIT = 16; |
| 3596 constexpr uint32_t BytesPerStorep = 16; |
| 3597 constexpr uint32_t BytesPerStoreq = 8; |
| 3598 constexpr uint32_t BytesPerStorei32 = 4; |
| 3599 constexpr uint32_t BytesPerStorei16 = 2; |
| 3600 constexpr uint32_t BytesPerStorei8 = 1; |
| 3493 assert(Val->getType() == IceType_i8); | 3601 assert(Val->getType() == IceType_i8); |
| 3494 | 3602 |
| 3495 // Check if the operands are constants | 3603 // Check if the operands are constants |
| 3496 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3604 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3497 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3605 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3498 const bool IsCountConst = CountConst != nullptr; | 3606 const bool IsCountConst = CountConst != nullptr; |
| 3499 const bool IsValConst = ValConst != nullptr; | 3607 const bool IsValConst = ValConst != nullptr; |
| 3500 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3608 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3501 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3609 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3502 | 3610 |
| 3503 // Unlikely, but nothing to do if it does happen | 3611 // Unlikely, but nothing to do if it does happen |
| 3504 if (IsCountConst && CountValue == 0) | 3612 if (IsCountConst && CountValue == 0) |
| 3505 return; | 3613 return; |
| 3506 | 3614 |
| 3507 // TODO(ascull): if the count is constant but val is not it would be possible | 3615 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3508 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3616 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3509 // eax, ax and al. | 3617 // eax, ax and al. |
| 3510 if (IsCountConst && IsValConst) { | 3618 if (IsCountConst && IsValConst) { |
| 3511 Variable *Base = legalizeToReg(Dest); | 3619 Variable *Base = nullptr; |
| 3512 // Add a FakeUse in case Base is ultimately not used, e.g. it falls back to | 3620 const uint32_t SpreadValue = |
| 3513 // calling memset(). Otherwise Om1 register allocation fails because this | 3621 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3514 // infinite-weight variable has a definition but no uses. | 3622 Variable *VecReg = nullptr; |
| 3515 Context.insert(InstFakeUse::create(Func, Base)); | |
| 3516 | 3623 |
| 3517 // 3 is the awkward size as it is too small for the vector or 32-bit | 3624 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3518 // operations and will not work with lowerLeftOvers as there is no valid | 3625 uint32_t OffsetAmt) { |
| 3519 // overlap. | 3626 assert(Base != nullptr); |
| 3520 if (CountValue == 3) { | 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3521 Constant *Offset = nullptr; | |
| 3522 auto *Mem = | |
| 3523 Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset); | |
| 3524 _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem); | |
| 3525 | 3628 |
| 3526 Offset = Ctx->getConstantInt8(2); | 3629 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3527 Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset); | 3630 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3528 _store(Ctx->getConstantInt8(ValValue), Mem); | 3631 if (isVectorType(Ty)) { |
| 3529 return; | 3632 assert(VecReg != nullptr); |
| 3530 } | 3633 _storep(VecReg, Mem); |
| 3634 } else if (Ty == IceType_i64) { |
| 3635 assert(VecReg != nullptr); |
| 3636 _storeq(VecReg, Mem); |
| 3637 } else { |
| 3638 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3639 } |
| 3640 }; |
| 3531 | 3641 |
| 3532 // Lowers the assignment to the remaining bytes. Assumes the original size | 3642 // Lowers the assignment to the remaining bytes. Assumes the original size |
| 3533 // was large enough to allow for overlaps. | 3643 // was large enough to allow for overlaps. |
| 3534 auto lowerLeftOvers = [this, Base, CountValue]( | 3644 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { |
| 3535 uint32_t SpreadValue, uint32_t Size, Variable *VecReg) { | 3645 if (Size > BytesPerStoreq) { |
| 3536 auto lowerStoreSpreadValue = | 3646 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); |
| 3537 [this, Base, CountValue, SpreadValue](Type Ty) { | 3647 } else if (Size > BytesPerStorei32) { |
| 3538 Constant *Offset = | 3648 lowerSet(IceType_i64, CountValue - BytesPerStoreq); |
| 3539 Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty)); | 3649 } else if (Size > BytesPerStorei16) { |
| 3540 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3650 lowerSet(IceType_i32, CountValue - BytesPerStorei32); |
| 3541 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3651 } else if (Size > BytesPerStorei8) { |
| 3542 }; | 3652 lowerSet(IceType_i16, CountValue - BytesPerStorei16); |
| 3543 | 3653 } else if (Size == BytesPerStorei8) { |
| 3544 if (Size > 8) { | 3654 lowerSet(IceType_i8, CountValue - BytesPerStorei8); |
| 3545 assert(VecReg != nullptr); | |
| 3546 Constant *Offset = Ctx->getConstantInt32(CountValue - 16); | |
| 3547 auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base, | |
| 3548 Offset); | |
| 3549 _storep(VecReg, Mem); | |
| 3550 } else if (Size > 4) { | |
| 3551 assert(VecReg != nullptr); | |
| 3552 Constant *Offset = Ctx->getConstantInt32(CountValue - 8); | |
| 3553 auto *Mem = | |
| 3554 Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset); | |
| 3555 _storeq(VecReg, Mem); | |
| 3556 } else if (Size > 2) { | |
| 3557 lowerStoreSpreadValue(IceType_i32); | |
| 3558 } else if (Size > 1) { | |
| 3559 lowerStoreSpreadValue(IceType_i16); | |
| 3560 } else if (Size == 1) { | |
| 3561 lowerStoreSpreadValue(IceType_i8); | |
| 3562 } | 3655 } |
| 3563 }; | 3656 }; |
| 3564 | 3657 |
| 3565 // When the value is zero it can be loaded into a register cheaply using | 3658 // When the value is zero it can be loaded into a vector register cheaply |
| 3566 // the xor trick. | 3659 // using the xor trick. |
| 3567 constexpr uint32_t BytesPerStorep = 16; | 3660 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3568 if (ValValue == 0 && CountValue >= 8 && | |
| 3569 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3661 CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3570 Variable *Zero = makeVectorOfZeros(IceType_v16i8); | 3662 Base = legalizeToReg(Dest); |
| 3663 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3571 | 3664 |
| 3572 // Too small to use large vector operations so use small ones instead | 3665 // Too small to use large vector operations so use small ones instead |
| 3573 if (CountValue < 16) { | 3666 if (CountValue < BytesPerStorep) { |
| 3574 Constant *Offset = nullptr; | 3667 lowerSet(IceType_i64, 0); |
| 3575 auto *Mem = | 3668 lowerLeftOvers(CountValue - BytesPerStoreq); |
| 3576 Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset); | |
| 3577 _storeq(Zero, Mem); | |
| 3578 lowerLeftOvers(0, CountValue - 8, Zero); | |
| 3579 return; | 3669 return; |
| 3580 } | 3670 } |
| 3581 | 3671 |
| 3582 assert(CountValue >= 16); | |
| 3583 // Use large vector operations | 3672 // Use large vector operations |
| 3584 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3673 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { |
| 3585 N -= 16; | 3674 N -= 16; |
| 3586 Constant *Offset = Ctx->getConstantInt32(N); | 3675 lowerSet(IceType_v16i8, N); |
| 3587 auto *Mem = | |
| 3588 Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset); | |
| 3589 _storep(Zero, Mem); | |
| 3590 } | 3676 } |
| 3591 uint32_t LeftOver = CountValue & 0xF; | 3677 lowerLeftOvers(CountValue & 0xF); |
| 3592 lowerLeftOvers(0, LeftOver, Zero); | |
| 3593 return; | 3678 return; |
| 3594 } | 3679 } |
| 3595 | 3680 |
| 3596 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3681 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? |
| 3597 constexpr uint32_t BytesPerStore = 4; | 3682 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { |
| 3598 if (CountValue <= BytesPerStore * UNROLL_LIMIT) { | 3683 Base = legalizeToReg(Dest); |
| 3684 // 3 is the awkward size as it is too small for the vector or 32-bit |
| 3685 // operations and will not work with lowerLeftOvers as there is no valid |
| 3686 // overlap. |
| 3687 if (CountValue == 3) { |
| 3688 lowerSet(IceType_i16, 0); |
| 3689 lowerSet(IceType_i8, 2); |
| 3690 return; |
| 3691 } |
| 3692 |
| 3599 // TODO(ascull): 64-bit can do better with 64-bit mov | 3693 // TODO(ascull): 64-bit can do better with 64-bit mov |
| 3600 uint32_t SpreadValue = | 3694 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { |
| 3601 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3695 N -= 4; |
| 3602 if (CountValue >= 4) { | 3696 lowerSet(IceType_i32, N); |
| 3603 Constant *ValueConst = Ctx->getConstantInt32(SpreadValue); | |
| 3604 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3605 N -= 4; | |
| 3606 Constant *Offset = Ctx->getConstantInt32(N); | |
| 3607 auto *Mem = | |
| 3608 Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset); | |
| 3609 _store(ValueConst, Mem); | |
| 3610 } | |
| 3611 } | 3697 } |
| 3612 uint32_t LeftOver = CountValue & 0x3; | 3698 lowerLeftOvers(CountValue & 0x3); |
| 3613 lowerLeftOvers(SpreadValue, LeftOver, nullptr); | |
| 3614 return; | 3699 return; |
| 3615 } | 3700 } |
| 3616 } | 3701 } |
| 3617 | 3702 |
| 3618 // Fall back on calling the memset function. The value operand needs to be | 3703 // Fall back on calling the memset function. The value operand needs to be |
| 3619 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3704 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3620 // be at least 32 bits wide. | 3705 // be at least 32 bits wide. |
| 3621 Operand *ValExt; | 3706 Operand *ValExt; |
| 3622 if (IsValConst) { | 3707 if (IsValConst) { |
| 3623 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3708 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5077 } | 5162 } |
| 5078 // the offset is not eligible for blinding or pooling, return the original | 5163 // the offset is not eligible for blinding or pooling, return the original |
| 5079 // mem operand | 5164 // mem operand |
| 5080 return MemOperand; | 5165 return MemOperand; |
| 5081 } | 5166 } |
| 5082 | 5167 |
| 5083 } // end of namespace X86Internal | 5168 } // end of namespace X86Internal |
| 5084 } // end of namespace Ice | 5169 } // end of namespace Ice |
| 5085 | 5170 |
| 5086 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5171 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
| OLD | NEW |