| OLD | NEW |
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3135 matching lines...) |
| 3146 Call->addArg(Instr->getArg(0)); | 3146 Call->addArg(Instr->getArg(0)); |
| 3147 Call->addArg(Instr->getArg(1)); | 3147 Call->addArg(Instr->getArg(1)); |
| 3148 lowerCall(Call); | 3148 lowerCall(Call); |
| 3149 return; | 3149 return; |
| 3150 } | 3150 } |
| 3151 case Intrinsics::Memcpy: { | 3151 case Intrinsics::Memcpy: { |
| 3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3153 return; | 3153 return; |
| 3154 } | 3154 } |
| 3155 case Intrinsics::Memmove: { | 3155 case Intrinsics::Memmove: { |
| 3156 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3156 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3157 Call->addArg(Instr->getArg(0)); | |
| 3158 Call->addArg(Instr->getArg(1)); | |
| 3159 Call->addArg(Instr->getArg(2)); | |
| 3160 lowerCall(Call); | |
| 3161 return; | 3157 return; |
| 3162 } | 3158 } |
| 3163 case Intrinsics::Memset: { | 3159 case Intrinsics::Memset: { |
| 3164 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3160 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3165 return; | 3161 return; |
| 3166 } | 3162 } |
| 3167 case Intrinsics::NaClReadTP: { | 3163 case Intrinsics::NaClReadTP: { |
| 3168 if (Ctx->getFlags().getUseSandboxing()) { | 3164 if (Ctx->getFlags().getUseSandboxing()) { |
| 3169 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); | 3165 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); |
| 3170 Variable *Dest = Instr->getDest(); | 3166 Variable *Dest = Instr->getDest(); |
| (...skipping 422 matching lines...) |
| 3593 _bsr(T_Dest2, SecondVar); | 3589 _bsr(T_Dest2, SecondVar); |
| 3594 _xor(T_Dest2, ThirtyOne); | 3590 _xor(T_Dest2, ThirtyOne); |
| 3595 } | 3591 } |
| 3596 _test(SecondVar, SecondVar); | 3592 _test(SecondVar, SecondVar); |
| 3597 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3593 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3598 _mov(DestLo, T_Dest2); | 3594 _mov(DestLo, T_Dest2); |
| 3599 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3595 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3600 } | 3596 } |
| 3601 | 3597 |
| 3602 template <class Machine> | 3598 template <class Machine> |
| 3599 void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base, |
| 3600 Constant *Offset) { |
| 3601 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3602 |
| 3603 if (isVectorType(Ty)) |
| 3604 _movp(Dest, Mem); |
| 3605 else if (Ty == IceType_f64) |
| 3606 _movq(Dest, Mem); |
| 3607 else |
| 3608 _mov(Dest, Mem); |
| 3609 } |
| 3610 |
| 3611 template <class Machine> |
| 3612 void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value, |
| 3613 Variable *Base, Constant *Offset) { |
| 3614 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3615 |
| 3616 if (isVectorType(Ty)) |
| 3617 _storep(Value, Mem); |
| 3618 else if (Ty == IceType_f64) |
| 3619 _storeq(Value, Mem); |
| 3620 else |
| 3621 _store(Value, Mem); |
| 3622 } |
| 3623 |
| 3624 template <class Machine> |
| 3625 void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src, |
| 3626 int32_t OffsetAmt) { |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3628 // TODO(ascull): this or add nullptr test to _movp, _movq |
| 3629 Variable *Data = makeReg(Ty); |
| 3630 |
| 3631 typedLoad(Ty, Data, Src, Offset); |
| 3632 typedStore(Ty, Data, Dest, Offset); |
| 3633 } |
| 3634 |
| 3635 template <class Machine> |
| 3603 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, | 3636 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| 3604 Operand *Count) { | 3637 Operand *Count) { |
| 3605 // There is a load and store for each chunk in the unroll | 3638 // There is a load and store for each chunk in the unroll |
| 3606 constexpr uint32_t UNROLL_LIMIT = 8; | |
| 3607 constexpr uint32_t BytesPerStorep = 16; | 3639 constexpr uint32_t BytesPerStorep = 16; |
| 3608 constexpr uint32_t BytesPerStoreq = 8; | |
| 3609 constexpr uint32_t BytesPerStorei32 = 4; | |
| 3610 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3611 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3612 | 3640 |
| 3613 // Check if the operands are constants | 3641 // Check if the operands are constants |
| 3614 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3642 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3615 const bool IsCountConst = CountConst != nullptr; | 3643 const bool IsCountConst = CountConst != nullptr; |
| 3616 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3644 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3617 | 3645 |
| 3618 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3646 if (shouldOptimizeMemIntrins() && IsCountConst && |
| 3647 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3619 // Unlikely, but nothing to do if it does happen | 3648 // Unlikely, but nothing to do if it does happen |
| 3620 if (CountValue == 0) | 3649 if (CountValue == 0) |
| 3621 return; | 3650 return; |
| 3622 | 3651 |
| 3623 Variable *SrcBase = legalizeToReg(Src); | 3652 Variable *SrcBase = legalizeToReg(Src); |
| 3624 Variable *DestBase = legalizeToReg(Dest); | 3653 Variable *DestBase = legalizeToReg(Dest); |
| 3625 | 3654 |
| 3626 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { | 3655 // Find the largest type that can be used and use it as much as possible in |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3656 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3628 // TODO(ascull): this or add nullptr test to _movp, _movq | 3657 // the remainder will be at the end, there will be reduced pressure on the |
| 3629 Variable *Data = makeReg(Ty); | 3658 // memory unit as the accesses to the same memory are far apart. |
| 3659 Type Ty = largestTypeInSize(CountValue); |
| 3660 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3630 | 3661 |
| 3631 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3662 uint32_t RemainingBytes = CountValue; |
| 3632 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); | 3663 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3633 if (isVectorType(Ty)) | 3664 while (RemainingBytes >= TyWidth) { |
| 3634 _movp(Data, SrcMem); | 3665 copyMemory(Ty, DestBase, SrcBase, Offset); |
| 3635 else if (Ty == IceType_f64) | 3666 RemainingBytes -= TyWidth; |
| 3636 _movq(Data, SrcMem); | 3667 Offset -= TyWidth; |
| 3637 else | |
| 3638 _mov(Data, SrcMem); | |
| 3639 | |
| 3640 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); | |
| 3641 if (isVectorType(Ty)) | |
| 3642 _storep(Data, DestMem); | |
| 3643 else if (Ty == IceType_f64) | |
| 3644 _storeq(Data, DestMem); | |
| 3645 else | |
| 3646 _store(Data, DestMem); | |
| 3647 }; | |
| 3648 | |
| 3649 // Lowers the assignment to the remaining bytes. Assumes the original size | |
| 3650 // was large enough to allow for overlaps. | |
| 3651 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { | |
| 3652 if (Size > BytesPerStoreq) { | |
| 3653 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); | |
| 3654 } else if (Size > BytesPerStorei32) { | |
| 3655 lowerCopy(IceType_f64, CountValue - BytesPerStoreq); | |
| 3656 } else if (Size > BytesPerStorei16) { | |
| 3657 lowerCopy(IceType_i32, CountValue - BytesPerStorei32); | |
| 3658 } else if (Size > BytesPerStorei8) { | |
| 3659 lowerCopy(IceType_i16, CountValue - BytesPerStorei16); | |
| 3660 } else if (Size == BytesPerStorei8) { | |
| 3661 lowerCopy(IceType_i8, CountValue - BytesPerStorei8); | |
| 3662 } | |
| 3663 }; | |
| 3664 | |
| 3665 if (CountValue >= BytesPerStorep) { | |
| 3666 // Use large vector operations | |
| 3667 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | |
| 3668 N -= BytesPerStorep; | |
| 3669 lowerCopy(IceType_v16i8, N); | |
| 3670 } | |
| 3671 lowerLeftOvers(CountValue & 0xF); | |
| 3672 return; | |
| 3673 } | 3668 } |
| 3674 | 3669 |
| 3675 // Too small to use large vector operations so use small ones instead | 3670 if (RemainingBytes == 0) |
| 3676 if (CountValue >= BytesPerStoreq) { | |
| 3677 lowerCopy(IceType_f64, 0); | |
| 3678 lowerLeftOvers(CountValue - BytesPerStoreq); | |
| 3679 return; | 3671 return; |
| 3680 } | |
| 3681 | 3672 |
| 3682 // Too small for vector operations so use scalar ones | 3673 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3683 if (CountValue >= BytesPerStorei32) { | 3674 // of overlaps in the copies. |
| 3684 lowerCopy(IceType_i32, 0); | 3675 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3685 lowerLeftOvers(CountValue - BytesPerStorei32); | 3676 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3686 return; | 3677 copyMemory(LeftOverTy, DestBase, SrcBase, Offset); |
| 3687 } | |
| 3688 | |
| 3689 // 3 is the awkward size as it is too small for the vector or 32-bit | |
| 3690 // operations and will not work with lowerLeftOvers as there is no valid | |
| 3691 // overlap. | |
| 3692 if (CountValue == 3) { | |
| 3693 lowerCopy(IceType_i16, 0); | |
| 3694 lowerCopy(IceType_i8, 2); | |
| 3695 return; | |
| 3696 } | |
| 3697 | |
| 3698 // 1 or 2 can be done in a single scalar copy | |
| 3699 lowerLeftOvers(CountValue); | |
| 3700 return; | 3678 return; |
| 3701 } | 3679 } |
| 3702 | 3680 |
| 3703 // Fall back on a function call | 3681 // Fall back on a function call |
| 3704 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | 3682 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| 3705 Call->addArg(Dest); | 3683 Call->addArg(Dest); |
| 3706 Call->addArg(Src); | 3684 Call->addArg(Src); |
| 3707 Call->addArg(Count); | 3685 Call->addArg(Count); |
| 3686 lowerCall(Call); |
| 3687 } |
| 3688 |
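To see the new schedule concretely: for a constant count of 13, largestTypeInSize picks the 8-byte type, the loop emits one whole chunk at offset 0, and the remainder is widened to a second 8-byte copy at offset 5 that overlaps the first. Below is a minimal standalone model of that arithmetic, not Subzero code: __builtin_clz stands in for llvm::findLastSet, and the index cap of 4 mirrors the TypeForSize table added further down.

    // Standalone model of the unrolled memcpy chunk schedule (illustrative).
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Widths backing TypeForSize: i8, i16, i32, f64/i64, v16i8.
    static uint32_t largestWidthInSize(uint32_t Size) {
      uint32_t Index = 31 - __builtin_clz(Size); // llvm::findLastSet
      return 1u << std::min(Index, 4u);
    }

    static uint32_t firstWidthThatFitsSize(uint32_t Size) {
      uint32_t Index = 31 - __builtin_clz(Size);
      if (Size & (Size - 1)) // not a power of two: round up
        ++Index;
      return 1u << std::min(Index, 4u);
    }

    int main() {
      const uint32_t CountValue = 13; // e.g. a constant 13-byte memcpy
      const uint32_t TyWidth = largestWidthInSize(CountValue);
      uint32_t RemainingBytes = CountValue;
      int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
      while (RemainingBytes >= TyWidth) { // whole chunks, highest offset first
        std::printf("copy %u bytes at offset %d\n", TyWidth, Offset);
        RemainingBytes -= TyWidth;
        Offset -= TyWidth;
      }
      if (RemainingBytes != 0) { // widened, overlapping tail copy
        uint32_t W = firstWidthThatFitsSize(RemainingBytes);
        std::printf("copy %u bytes at offset %d\n", W, int32_t(CountValue - W));
      }
    }

This prints "copy 8 bytes at offset 0" then "copy 8 bytes at offset 5": the tail re-copies bytes 5..7, which is harmless because memcpy forbids source/destination overlap, so those bytes are simply written twice with the same value.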
| 3689 template <class Machine> |
| 3690 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, |
| 3691 Operand *Count) { |
| 3692 // There is a load and store for each chunk in the unroll |
| 3693 constexpr uint32_t BytesPerStorep = 16; |
| 3694 |
| 3695 // Check if the operands are constants |
| 3696 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3697 const bool IsCountConst = CountConst != nullptr; |
| 3698 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3699 |
| 3700 if (shouldOptimizeMemIntrins() && IsCountConst && |
| 3701 CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) { |
| 3702 // Unlikely, but nothing to do if it does happen |
| 3703 if (CountValue == 0) |
| 3704 return; |
| 3705 |
| 3706 Variable *SrcBase = legalizeToReg(Src); |
| 3707 Variable *DestBase = legalizeToReg(Dest); |
| 3708 |
| 3709 std::tuple<Type, Constant *, Variable *> |
| 3710 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; |
| 3711 Constant *Offset; |
| 3712 Variable *Reg; |
| 3713 |
| 3714 // Copy the data into registers as the source and destination could overlap |
| 3715 // so make sure not to clobber the memory. This also means overlapping moves |
| 3716 // can be used as we are taking a safe snapshot of the memory. |
| 3717 Type Ty = largestTypeInSize(CountValue); |
| 3718 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3719 |
| 3720 uint32_t RemainingBytes = CountValue; |
| 3721 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3722 size_t N = 0; |
| 3723 while (RemainingBytes >= TyWidth) { |
| 3724 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
| 3725 Offset = Ctx->getConstantInt32(OffsetAmt); |
| 3726 Reg = makeReg(Ty); |
| 3727 typedLoad(Ty, Reg, SrcBase, Offset); |
| 3728 RemainingBytes -= TyWidth; |
| 3729 OffsetAmt -= TyWidth; |
| 3730 Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| 3731 } |
| 3732 |
| 3733 if (RemainingBytes != 0) { |
| 3734 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3735 // of overlaps in the copies. |
| 3736 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
| 3737 Ty = firstTypeThatFitsSize(RemainingBytes); |
| 3738 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty)); |
| 3739 Reg = makeReg(Ty); |
| 3740 typedLoad(Ty, Reg, SrcBase, Offset); |
| 3741 Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| 3742 } |
| 3743 |
| 3744 // Copy the data out into the destination memory |
| 3745 for (size_t i = 0; i < N; ++i) { |
| 3746 std::tie(Ty, Offset, Reg) = Moves[i]; |
| 3747 typedStore(Ty, Reg, DestBase, Offset); |
| 3748 } |
| 3749 |
| 3750 return; |
| 3751 } |
| 3752 |
| 3753 // Fall back on a function call |
| 3754 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); |
| 3755 Call->addArg(Dest); |
| 3756 Call->addArg(Src); |
| 3757 Call->addArg(Count); |
| 3708 lowerCall(Call); | 3758 lowerCall(Call); |
| 3709 } | 3759 } |
| 3710 | 3760 |
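Because memmove's source and destination may overlap, every chunk is loaded into a register before any store is issued; the Moves array is that snapshot. Below is a minimal standalone check of the two-phase schedule, using the chunks lowerMemmove would pick for a constant count of 11 (one 8-byte chunk at offset 0, then a widened 4-byte tail at offset 7); the buffer setup and comparison against std::memmove are illustrative assumptions, not Subzero code.

    // Two-phase (load-all, then store-all) schedule checked on an
    // overlapping buffer; mirrors the lowering above for CountValue = 11.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    struct Chunk { uint32_t Width; uint32_t Offset; };
    static const Chunk Schedule[] = {{8, 0}, {4, 7}}; // 8@0, widened tail 4@7

    int main() {
      uint8_t Buf[32], Expected[32];
      for (int i = 0; i < 32; ++i) Buf[i] = uint8_t(i);
      std::memcpy(Expected, Buf, 32);
      std::memmove(Expected, Expected + 4, 11); // reference behavior

      uint8_t *Src = Buf + 4, *Dest = Buf; // regions overlap in Buf[4..10]

      // Phase 1: read every chunk into a "register" before any store.
      uint8_t Regs[2][8];
      for (int i = 0; i < 2; ++i)
        std::memcpy(Regs[i], Src + Schedule[i].Offset, Schedule[i].Width);
      // Phase 2: write the snapshot out; overlap direction no longer matters.
      for (int i = 0; i < 2; ++i)
        std::memcpy(Dest + Schedule[i].Offset, Regs[i], Schedule[i].Width);

      assert(std::memcmp(Buf, Expected, 32) == 0);
    }

The chunks themselves overlap at byte 7, but both the double read and the double write see the same snapshot value, so the result matches std::memmove.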
| 3711 template <class Machine> | 3761 template <class Machine> |
| 3712 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3762 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3713 Operand *Count) { | 3763 Operand *Count) { |
| 3714 constexpr uint32_t UNROLL_LIMIT = 16; | |
| 3715 constexpr uint32_t BytesPerStorep = 16; | 3764 constexpr uint32_t BytesPerStorep = 16; |
| 3716 constexpr uint32_t BytesPerStoreq = 8; | 3765 constexpr uint32_t BytesPerStoreq = 8; |
| 3717 constexpr uint32_t BytesPerStorei32 = 4; | 3766 constexpr uint32_t BytesPerStorei32 = 4; |
| 3718 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3719 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3720 assert(Val->getType() == IceType_i8); | 3767 assert(Val->getType() == IceType_i8); |
| 3721 | 3768 |
| 3722 // Check if the operands are constants | 3769 // Check if the operands are constants |
| 3723 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3770 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3724 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3771 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3725 const bool IsCountConst = CountConst != nullptr; | 3772 const bool IsCountConst = CountConst != nullptr; |
| 3726 const bool IsValConst = ValConst != nullptr; | 3773 const bool IsValConst = ValConst != nullptr; |
| 3727 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3774 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3728 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3775 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3729 | 3776 |
| 3730 // Unlikely, but nothing to do if it does happen | 3777 // Unlikely, but nothing to do if it does happen |
| 3731 if (IsCountConst && CountValue == 0) | 3778 if (IsCountConst && CountValue == 0) |
| 3732 return; | 3779 return; |
| 3733 | 3780 |
| 3734 // TODO(ascull): if the count is constant but val is not it would be possible | 3781 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3735 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3782 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3736 // eax, ax and al. | 3783 // eax, ax and al. |
| 3737 if (IsCountConst && IsValConst) { | 3784 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) { |
| 3738 Variable *Base = nullptr; | 3785 Variable *Base = nullptr; |
| 3786 Variable *VecReg = nullptr; |
| 3739 const uint32_t SpreadValue = | 3787 const uint32_t SpreadValue = |
| 3740 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3788 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3741 Variable *VecReg = nullptr; | |
| 3742 | 3789 |
| 3743 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, | 3790 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3744 uint32_t OffsetAmt) { | 3791 uint32_t OffsetAmt) { |
| 3745 assert(Base != nullptr); | 3792 assert(Base != nullptr); |
| 3746 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3793 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3747 | 3794 |
| 3748 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3795 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3749 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3796 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3750 if (isVectorType(Ty)) { | 3797 if (isVectorType(Ty)) { |
| 3751 assert(VecReg != nullptr); | 3798 assert(VecReg != nullptr); |
| 3752 _storep(VecReg, Mem); | 3799 _storep(VecReg, Mem); |
| 3753 } else if (Ty == IceType_i64) { | 3800 } else if (Ty == IceType_f64) { |
| 3754 assert(VecReg != nullptr); | 3801 assert(VecReg != nullptr); |
| 3755 _storeq(VecReg, Mem); | 3802 _storeq(VecReg, Mem); |
| 3756 } else { | 3803 } else { |
| 3757 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3804 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3758 } | 3805 } |
| 3759 }; | 3806 }; |
| 3760 | 3807 |
| 3761 // Lowers the assignment to the remaining bytes. Assumes the original size | 3808 // Find the largest type that can be used and use it as much as possible in |
| 3762 // was large enough to allow for overlaps. | 3809 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3763 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { | 3810 // the remainder will be at the end, there will be reduced pressure on the |
| 3764 if (Size > BytesPerStoreq) { | 3811 // memory unit as the accesses to the same memory are far apart. |
| 3765 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); | 3812 Type Ty; |
| 3766 } else if (Size > BytesPerStorei32) { | |
| 3767 lowerSet(IceType_i64, CountValue - BytesPerStoreq); | |
| 3768 } else if (Size > BytesPerStorei16) { | |
| 3769 lowerSet(IceType_i32, CountValue - BytesPerStorei32); | |
| 3770 } else if (Size > BytesPerStorei8) { | |
| 3771 lowerSet(IceType_i16, CountValue - BytesPerStorei16); | |
| 3772 } else if (Size == BytesPerStorei8) { | |
| 3773 lowerSet(IceType_i8, CountValue - BytesPerStorei8); | |
| 3774 } | |
| 3775 }; | |
| 3776 | |
| 3777 // When the value is zero it can be loaded into a vector register cheaply | |
| 3778 // using the xor trick. | |
| 3779 if (ValValue == 0 && CountValue >= BytesPerStoreq && | 3813 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3780 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3814 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3815 // When the value is zero it can be loaded into a vector register cheaply |
| 3816 // using the xor trick. |
| 3781 Base = legalizeToReg(Dest); | 3817 Base = legalizeToReg(Dest); |
| 3782 VecReg = makeVectorOfZeros(IceType_v16i8); | 3818 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3819 Ty = largestTypeInSize(CountValue); |
| 3820 } else if (CountValue <= BytesPerStorei32 * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3821 // When the value is non-zero or the count is small we can't use vector |
| 3822 // instructions so are limited to 32-bit stores. |
| 3823 Base = legalizeToReg(Dest); |
| 3824 constexpr uint32_t MaxSize = 4; |
| 3825 Ty = largestTypeInSize(CountValue, MaxSize); |
| 3826 } |
| 3783 | 3827 |
| 3784 // Too small to use large vector operations so use small ones instead | 3828 if (Base) { |
| 3785 if (CountValue < BytesPerStorep) { | 3829 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3786 lowerSet(IceType_i64, 0); | 3830 |
| 3787 lowerLeftOvers(CountValue - BytesPerStoreq); | 3831 uint32_t RemainingBytes = CountValue; |
| 3788 return; | 3832 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3833 while (RemainingBytes >= TyWidth) { |
| 3834 lowerSet(Ty, Offset); |
| 3835 RemainingBytes -= TyWidth; |
| 3836 Offset -= TyWidth; |
| 3789 } | 3837 } |
| 3790 | 3838 |
| 3791 // Use large vector operations | 3839 if (RemainingBytes == 0) |
| 3792 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3840 return; |
| 3793 N -= 16; | |
| 3794 lowerSet(IceType_v16i8, N); | |
| 3795 } | |
| 3796 lowerLeftOvers(CountValue & 0xF); | |
| 3797 return; | |
| 3798 } | |
| 3799 | 3841 |
| 3800 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3842 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3801 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | 3843 // of overlaps in the copies. |
| 3802 Base = legalizeToReg(Dest); | 3844 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3803 // 3 is the awkward size as it is too small for the vector or 32-bit | 3845 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3804 // operations and will not work with lowerLeftOvers as there is no valid | 3846 lowerSet(LeftOverTy, Offset); |
| 3805 // overlap. | |
| 3806 if (CountValue == 3) { | |
| 3807 lowerSet(IceType_i16, 0); | |
| 3808 lowerSet(IceType_i8, 2); | |
| 3809 return; | |
| 3810 } | |
| 3811 | |
| 3812 // TODO(ascull); 64-bit can do better with 64-bit mov | |
| 3813 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3814 N -= 4; | |
| 3815 lowerSet(IceType_i32, N); | |
| 3816 } | |
| 3817 lowerLeftOvers(CountValue & 0x3); | |
| 3818 return; | 3847 return; |
| 3819 } | 3848 } |
| 3820 } | 3849 } |
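A worked trace of the non-zero-value path above, assuming a constant count of 10 and value 0xAB: SpreadValue becomes 0xABABABAB, chunks are capped at 32-bit stores (MaxSize = 4), and the lowering emits dword stores at offsets 4 and 0 followed by a word store at offset 8. The sketch below is a standalone model of that arithmetic, illustrative only; its tail computation is firstTypeThatFitsSize specialized to RemainingBytes < 4.

    // Model of the inlined memset store schedule for a constant count/value.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t CountValue = 10, ValValue = 0xAB;
      const uint32_t SpreadValue =
          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;

      // Non-zero value: no cheap xor-zeroed vector register, so the chunk
      // type is capped at i32 (TyWidth = 4).
      const uint32_t TyWidth = 4;
      uint32_t RemainingBytes = CountValue;
      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
      while (RemainingBytes >= TyWidth) {
        std::printf("store 0x%08X (%u bytes) at offset %u\n", SpreadValue,
                    TyWidth, Offset);
        RemainingBytes -= TyWidth;
        Offset -= TyWidth; // wraps after the last chunk but is never reused
      }
      if (RemainingBytes != 0) { // tail, widened to the next type that fits
        uint32_t W = RemainingBytes == 1 ? 1 : (RemainingBytes == 2 ? 2 : 4);
        std::printf("store %u bytes at offset %u\n", W, CountValue - W);
      }
    }

For count 10 this prints stores of 0xABABABAB at offsets 4 and 0, then a 2-byte store at offset 8; a 3-byte tail would instead widen to an overlapping 4-byte store.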
| 3821 | 3850 |
| 3822 // Fall back on calling the memset function. The value operand needs to be | 3851 // Fall back on calling the memset function. The value operand needs to be |
| 3823 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3852 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3824 // be at least 32 bits wide. | 3853 // be at least 32 bits wide. |
| 3825 Operand *ValExt; | 3854 Operand *ValExt; |
| 3826 if (IsValConst) { | 3855 if (IsValConst) { |
| 3827 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3856 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1218 matching lines...) |
| 5046 // There aren't any 64-bit integer registers for x86-32. | 5075 // There aren't any 64-bit integer registers for x86-32. |
| 5047 assert(Traits::Is64Bit || Type != IceType_i64); | 5076 assert(Traits::Is64Bit || Type != IceType_i64); |
| 5048 Variable *Reg = Func->makeVariable(Type); | 5077 Variable *Reg = Func->makeVariable(Type); |
| 5049 if (RegNum == Variable::NoRegister) | 5078 if (RegNum == Variable::NoRegister) |
| 5050 Reg->setWeightInfinite(); | 5079 Reg->setWeightInfinite(); |
| 5051 else | 5080 else |
| 5052 Reg->setRegNum(RegNum); | 5081 Reg->setRegNum(RegNum); |
| 5053 return Reg; | 5082 return Reg; |
| 5054 } | 5083 } |
| 5055 | 5084 |
| 5085 template <class Machine> |
| 5086 const Type TargetX86Base<Machine>::TypeForSize[] = { |
| 5087 IceType_i8, IceType_i16, IceType_i32, |
| 5088 (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8}; |
| 5089 template <class Machine> |
| 5090 Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size, |
| 5091 uint32_t MaxSize) { |
| 5092 assert(Size != 0); |
| 5093 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| 5094 uint32_t MaxIndex = MaxSize == NoSizeLimit |
| 5095 ? llvm::array_lengthof(TypeForSize) - 1 |
| 5096 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined); |
| 5097 return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| 5098 } |
| 5099 |
| 5100 template <class Machine> |
| 5101 Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size, |
| 5102 uint32_t MaxSize) { |
| 5103 assert(Size != 0); |
| 5104 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| 5105 if (!llvm::isPowerOf2_32(Size)) |
| 5106 ++TyIndex; |
| 5107 uint32_t MaxIndex = MaxSize == NoSizeLimit |
| 5108 ? llvm::array_lengthof(TypeForSize) - 1 |
| 5109 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined); |
| 5110 return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| 5111 } |
| 5112 |
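The two helpers differ only in rounding direction: largestTypeInSize rounds Size down to the type of its last set bit, while firstTypeThatFitsSize rounds up when Size is not a power of two, which is what lets a 5-byte remainder become a single overlapping 8-byte copy. A standalone sketch of both rules (illustrative; __builtin_clz stands in for llvm::findLastSet, and the cap is taken as a table index rather than a MaxSize in bytes):

    // The two rounding rules behind the type-selection helpers above.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static const char *TypeForSize[] = {"i8", "i16", "i32", "f64/i64", "v16i8"};

    static const char *largestTypeInSize(uint32_t Size, uint32_t MaxIndex = 4) {
      uint32_t TyIndex = 31 - __builtin_clz(Size); // round down to last set bit
      return TypeForSize[std::min(TyIndex, MaxIndex)];
    }

    static const char *firstTypeThatFitsSize(uint32_t Size,
                                             uint32_t MaxIndex = 4) {
      uint32_t TyIndex = 31 - __builtin_clz(Size);
      if (Size & (Size - 1)) // not a power of two: round up instead
        ++TyIndex;
      return TypeForSize[std::min(TyIndex, MaxIndex)];
    }

    int main() {
      assert(largestTypeInSize(13) == TypeForSize[3]);    // 13 -> 8-byte type
      assert(largestTypeInSize(10, 2) == TypeForSize[2]); // capped at i32
      assert(firstTypeThatFitsSize(5) == TypeForSize[3]); // 5 -> 8-byte type
      assert(firstTypeThatFitsSize(3) == TypeForSize[2]); // 3 -> i32
      assert(firstTypeThatFitsSize(4) == TypeForSize[2]); // exact power of two
    }

Rounding the tail up can add one extra chunk beyond the whole-chunk count, which is why lowerMemmove asserts N against MEMMOVE_UNROLL_LIMIT before each buffered move.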
| 5056 template <class Machine> void TargetX86Base<Machine>::postLower() { | 5113 template <class Machine> void TargetX86Base<Machine>::postLower() { |
| 5057 if (Ctx->getFlags().getOptLevel() == Opt_m1) | 5114 if (Ctx->getFlags().getOptLevel() == Opt_m1) |
| 5058 return; | 5115 return; |
| 5059 inferTwoAddress(); | 5116 inferTwoAddress(); |
| 5060 } | 5117 } |
| 5061 | 5118 |
| 5062 template <class Machine> | 5119 template <class Machine> |
| 5063 void TargetX86Base<Machine>::makeRandomRegisterPermutation( | 5120 void TargetX86Base<Machine>::makeRandomRegisterPermutation( |
| 5064 llvm::SmallVectorImpl<int32_t> &Permutation, | 5121 llvm::SmallVectorImpl<int32_t> &Permutation, |
| 5065 const llvm::SmallBitVector &ExcludeRegisters) const { | 5122 const llvm::SmallBitVector &ExcludeRegisters) const { |
| (...skipping 240 matching lines...) |
| 5306 } | 5363 } |
| 5307 // the offset is not eligible for blinding or pooling, return the original | 5364 // the offset is not eligible for blinding or pooling, return the original |
| 5308 // mem operand | 5365 // mem operand |
| 5309 return MemOperand; | 5366 return MemOperand; |
| 5310 } | 5367 } |
| 5311 | 5368 |
| 5312 } // end of namespace X86Internal | 5369 } // end of namespace X86Internal |
| 5313 } // end of namespace Ice | 5370 } // end of namespace Ice |
| 5314 | 5371 |
| 5315 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5372 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |