| OLD | NEW |
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3135 matching lines...) |
| 3146 Call->addArg(Instr->getArg(0)); | 3146 Call->addArg(Instr->getArg(0)); |
| 3147 Call->addArg(Instr->getArg(1)); | 3147 Call->addArg(Instr->getArg(1)); |
| 3148 lowerCall(Call); | 3148 lowerCall(Call); |
| 3149 return; | 3149 return; |
| 3150 } | 3150 } |
| 3151 case Intrinsics::Memcpy: { | 3151 case Intrinsics::Memcpy: { |
| 3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3153 return; | 3153 return; |
| 3154 } | 3154 } |
| 3155 case Intrinsics::Memmove: { | 3155 case Intrinsics::Memmove: { |
| 3156 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3156 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3157 Call->addArg(Instr->getArg(0)); | |
| 3158 Call->addArg(Instr->getArg(1)); | |
| 3159 Call->addArg(Instr->getArg(2)); | |
| 3160 lowerCall(Call); | |
| 3161 return; | 3157 return; |
| 3162 } | 3158 } |
| 3163 case Intrinsics::Memset: { | 3159 case Intrinsics::Memset: { |
| 3164 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3160 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3165 return; | 3161 return; |
| 3166 } | 3162 } |
| 3167 case Intrinsics::NaClReadTP: { | 3163 case Intrinsics::NaClReadTP: { |
| 3168 if (Ctx->getFlags().getUseSandboxing()) { | 3164 if (Ctx->getFlags().getUseSandboxing()) { |
| 3169 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); | 3165 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); |
| 3170 Variable *Dest = Instr->getDest(); | 3166 Variable *Dest = Instr->getDest(); |
| (...skipping 422 matching lines...) |
| 3593 _bsr(T_Dest2, SecondVar); | 3589 _bsr(T_Dest2, SecondVar); |
| 3594 _xor(T_Dest2, ThirtyOne); | 3590 _xor(T_Dest2, ThirtyOne); |
| 3595 } | 3591 } |
| 3596 _test(SecondVar, SecondVar); | 3592 _test(SecondVar, SecondVar); |
| 3597 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3593 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3598 _mov(DestLo, T_Dest2); | 3594 _mov(DestLo, T_Dest2); |
| 3599 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3595 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3600 } | 3596 } |
| 3601 | 3597 |
| 3602 template <class Machine> | 3598 template <class Machine> |
| 3599 void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base, |
| 3600 Constant *Offset) { |
| 3601 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3602 |
| 3603 if (isVectorType(Ty)) |
| 3604 _movp(Dest, Mem); |
| 3605 else if (Ty == IceType_f64) |
| 3606 _movq(Dest, Mem); |
| 3607 else |
| 3608 _mov(Dest, Mem); |
| 3609 } |
| 3610 |
| 3611 template <class Machine> |
| 3612 void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value, |
| 3613 Variable *Base, Constant *Offset) { |
| 3614 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3615 |
| 3616 if (isVectorType(Ty)) |
| 3617 _storep(Value, Mem); |
| 3618 else if (Ty == IceType_f64) |
| 3619 _storeq(Value, Mem); |
| 3620 else |
| 3621 _store(Value, Mem); |
| 3622 } |
| 3623 |
| 3624 template <class Machine> |
| 3625 void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src, |
| 3626 int32_t OffsetAmt) { |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3628 // TODO(ascull): this or add nullptr test to _movp, _movq |
| 3629 Variable *Data = makeReg(Ty); |
| 3630 |
| 3631 typedLoad(Ty, Data, Src, Offset); |
| 3632 typedStore(Ty, Data, Dest, Offset); |
| 3633 } |
| 3634 |
| 3635 template <class Machine> |
| 3603 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, | 3636 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| 3604 Operand *Count) { | 3637 Operand *Count) { |
| 3605 // There is a load and store for each chunk in the unroll | 3638 // There is a load and store for each chunk in the unroll |
| 3606 constexpr uint32_t UNROLL_LIMIT = 8; | |
| 3607 constexpr uint32_t BytesPerStorep = 16; | 3639 constexpr uint32_t BytesPerStorep = 16; |
| 3608 constexpr uint32_t BytesPerStoreq = 8; | |
| 3609 constexpr uint32_t BytesPerStorei32 = 4; | |
| 3610 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3611 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3612 | 3640 |
| 3613 // Check if the operands are constants | 3641 // Check if the operands are constants |
| 3614 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3642 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3615 const bool IsCountConst = CountConst != nullptr; | 3643 const bool IsCountConst = CountConst != nullptr; |
| 3616 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3644 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3617 | 3645 |
| 3618 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3646 if (shouldOptimizeMemIntrins() && IsCountConst && |
| 3647 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3619 // Unlikely, but nothing to do if it does happen | 3648 // Unlikely, but nothing to do if it does happen |
| 3620 if (CountValue == 0) | 3649 if (CountValue == 0) |
| 3621 return; | 3650 return; |
| 3622 | 3651 |
| 3623 Variable *SrcBase = legalizeToReg(Src); | 3652 Variable *SrcBase = legalizeToReg(Src); |
| 3624 Variable *DestBase = legalizeToReg(Dest); | 3653 Variable *DestBase = legalizeToReg(Dest); |
| 3625 | 3654 |
| 3626 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { | 3655 // Find the largest type that can be used and use it as much as possible in |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3656 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3628 // TODO(ascull): this or add nullptr test to _movp, _movq | 3657 // the remainder will be at the end, there will be reduced pressure on the |
| 3629 Variable *Data = makeReg(Ty); | 3658 // memory unit as the accesses to the same memory are far apart. |
| 3659 Type Ty = largestTypeInSize(CountValue); |
| 3660 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3630 | 3661 |
| 3631 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3662 uint32_t RemainingBytes = CountValue; |
| 3632 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); | 3663 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3633 if (isVectorType(Ty)) | 3664 while (RemainingBytes >= TyWidth) { |
| 3634 _movp(Data, SrcMem); | 3665 copyMemory(Ty, DestBase, SrcBase, Offset); |
| 3635 else if (Ty == IceType_f64) | 3666 RemainingBytes -= TyWidth; |
| 3636 _movq(Data, SrcMem); | 3667 Offset -= TyWidth; |
| 3637 else | |
| 3638 _mov(Data, SrcMem); | |
| 3639 | |
| 3640 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); | |
| 3641 if (isVectorType(Ty)) | |
| 3642 _storep(Data, DestMem); | |
| 3643 else if (Ty == IceType_f64) | |
| 3644 _storeq(Data, DestMem); | |
| 3645 else | |
| 3646 _store(Data, DestMem); | |
| 3647 }; | |
| 3648 | |
| 3649 // Lowers the assignment to the remaining bytes. Assumes the original size | |
| 3650 // was large enough to allow for overlaps. | |
| 3651 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { | |
| 3652 if (Size > BytesPerStoreq) { | |
| 3653 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); | |
| 3654 } else if (Size > BytesPerStorei32) { | |
| 3655 lowerCopy(IceType_f64, CountValue - BytesPerStoreq); | |
| 3656 } else if (Size > BytesPerStorei16) { | |
| 3657 lowerCopy(IceType_i32, CountValue - BytesPerStorei32); | |
| 3658 } else if (Size > BytesPerStorei8) { | |
| 3659 lowerCopy(IceType_i16, CountValue - BytesPerStorei16); | |
| 3660 } else if (Size == BytesPerStorei8) { | |
| 3661 lowerCopy(IceType_i8, CountValue - BytesPerStorei8); | |
| 3662 } | |
| 3663 }; | |
| 3664 | |
| 3665 if (CountValue >= BytesPerStorep) { | |
| 3666 // Use large vector operations | |
| 3667 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | |
| 3668 N -= BytesPerStorep; | |
| 3669 lowerCopy(IceType_v16i8, N); | |
| 3670 } | |
| 3671 lowerLeftOvers(CountValue & 0xF); | |
| 3672 return; | |
| 3673 } | 3668 } |
| 3674 | 3669 |
| 3675 // Too small to use large vector operations so use small ones instead | 3670 if (RemainingBytes == 0) |
| 3676 if (CountValue >= BytesPerStoreq) { | |
| 3677 lowerCopy(IceType_f64, 0); | |
| 3678 lowerLeftOvers(CountValue - BytesPerStoreq); | |
| 3679 return; | 3671 return; |
| 3680 } | |
| 3681 | 3672 |
| 3682 // Too small for vector operations so use scalar ones | 3673 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3683 if (CountValue >= BytesPerStorei32) { | 3674 // of overlaps in the copies. |
| 3684 lowerCopy(IceType_i32, 0); | 3675 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3685 lowerLeftOvers(CountValue - BytesPerStorei32); | 3676 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3686 return; | 3677 copyMemory(LeftOverTy, DestBase, SrcBase, Offset); |
| 3687 } | |
| 3688 | |
| 3689 // 3 is the awkward size as it is too small for the vector or 32-bit | |
| 3690 // operations and will not work with lowerLeftOvers as there is no valid | |
| 3691 // overlap. | |
| 3692 if (CountValue == 3) { | |
| 3693 lowerCopy(IceType_i16, 0); | |
| 3694 lowerCopy(IceType_i8, 2); | |
| 3695 return; | |
| 3696 } | |
| 3697 | |
| 3698 // 1 or 2 can be done in a single scalar copy | |
| 3699 lowerLeftOvers(CountValue); | |
| 3700 return; | 3678 return; |
| 3701 } | 3679 } |
| 3702 | 3680 |
| 3703 // Fall back on a function call | 3681 // Fall back on a function call |
| 3704 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | 3682 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| 3705 Call->addArg(Dest); | 3683 Call->addArg(Dest); |
| 3706 Call->addArg(Src); | 3684 Call->addArg(Src); |
| 3707 Call->addArg(Count); | 3685 Call->addArg(Count); |
| 3686 lowerCall(Call); |
| 3687 } |
| 3688 |
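To see the new schedule concretely: for a constant count of 13, largestTypeInSize picks the 8-byte type, the loop emits one whole chunk at offset 0, and the remainder is widened to a second 8-byte copy at offset 5 that overlaps the first. Below is a minimal standalone model of that arithmetic, not Subzero code: __builtin_clz stands in for llvm::findLastSet, and the index cap of 4 mirrors the TypeForSize table added further down.

    // Standalone model of the unrolled memcpy chunk schedule (illustrative).
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Widths backing TypeForSize: i8, i16, i32, f64/i64, v16i8.
    static uint32_t largestWidthInSize(uint32_t Size) {
      uint32_t Index = 31 - __builtin_clz(Size); // llvm::findLastSet
      return 1u << std::min(Index, 4u);
    }

    static uint32_t firstWidthThatFitsSize(uint32_t Size) {
      uint32_t Index = 31 - __builtin_clz(Size);
      if (Size & (Size - 1)) // not a power of two: round up
        ++Index;
      return 1u << std::min(Index, 4u);
    }

    int main() {
      const uint32_t CountValue = 13; // e.g. a constant 13-byte memcpy
      const uint32_t TyWidth = largestWidthInSize(CountValue);
      uint32_t RemainingBytes = CountValue;
      int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
      while (RemainingBytes >= TyWidth) { // whole chunks, highest offset first
        std::printf("copy %u bytes at offset %d\n", TyWidth, Offset);
        RemainingBytes -= TyWidth;
        Offset -= TyWidth;
      }
      if (RemainingBytes != 0) { // widened, overlapping tail copy
        uint32_t W = firstWidthThatFitsSize(RemainingBytes);
        std::printf("copy %u bytes at offset %d\n", W, int32_t(CountValue - W));
      }
    }

This prints "copy 8 bytes at offset 0" then "copy 8 bytes at offset 5": the tail re-copies bytes 5..7, which is harmless because memcpy forbids source/destination overlap, so those bytes are simply written twice with the same value.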
| 3689 template <class Machine> |
| 3690 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, |
| 3691 Operand *Count) { |
| 3692 // There is a load and store for each chunk in the unroll |
| 3693 constexpr uint32_t BytesPerStorep = 16; |
| 3694 |
| 3695 // Check if the operands are constants |
| 3696 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3697 const bool IsCountConst = CountConst != nullptr; |
| 3698 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3699 |
| 3700 if (shouldOptimizeMemIntrins() && IsCountConst && |
| 3701 CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) { |
| 3702 // Unlikely, but nothing to do if it does happen |
| 3703 if (CountValue == 0) |
| 3704 return; |
| 3705 |
| 3706 Variable *SrcBase = legalizeToReg(Src); |
| 3707 Variable *DestBase = legalizeToReg(Dest); |
| 3708 |
| 3709 std::tuple<Type, Constant *, Variable *> |
| 3710 Moves[Traits::MEMMOVE_UNROLL_LIMIT]; |
| 3711 Constant *Offset; |
| 3712 Variable *Reg; |
| 3713 |
| 3714 // Copy the data into registers as the source and destination could overlap |
| 3715 // so make sure not to clobber the memory. This also means overlapping moves |
| 3716 // can be used as we are taking a safe snapshot of the memory. |
| 3717 Type Ty = largestTypeInSize(CountValue); |
| 3718 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3719 |
| 3720 uint32_t RemainingBytes = CountValue; |
| 3721 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3722 size_t N = 0; |
| 3723 while (RemainingBytes >= TyWidth) { |
| 3724 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
| 3725 Offset = Ctx->getConstantInt32(OffsetAmt); |
| 3726 Reg = makeReg(Ty); |
| 3727 typedLoad(Ty, Reg, SrcBase, Offset); |
| 3728 RemainingBytes -= TyWidth; |
| 3729 OffsetAmt -= TyWidth; |
| 3730 Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| 3731 } |
| 3732 |
| 3733 if (RemainingBytes != 0) { |
| 3734 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3735 // of overlaps in the copies. |
| 3736 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT); |
| 3737 Ty = firstTypeThatFitsSize(RemainingBytes); |
| 3738 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty)); |
| 3739 Reg = makeReg(Ty); |
| 3740 typedLoad(Ty, Reg, SrcBase, Offset); |
| 3741 Moves[N++] = std::make_tuple(Ty, Offset, Reg); |
| 3742 } |
| 3743 |
| 3744 // Copy the data out into the destination memory |
| 3745 for (size_t i = 0; i < N; ++i) { |
| 3746 std::tie(Ty, Offset, Reg) = Moves[i]; |
| 3747 typedStore(Ty, Reg, DestBase, Offset); |
| 3748 } |
| 3749 |
| 3750 return; |
| 3751 } |
| 3752 |
| 3753 // Fall back on a function call |
| 3754 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); |
| 3755 Call->addArg(Dest); |
| 3756 Call->addArg(Src); |
| 3757 Call->addArg(Count); |
| 3708 lowerCall(Call); | 3758 lowerCall(Call); |
| 3709 } | 3759 } |
| 3710 | 3760 |
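Because memmove's source and destination may overlap, every chunk is loaded into a register before any store is issued; the Moves array is that snapshot. Below is a minimal standalone check of the two-phase schedule, using the chunks lowerMemmove would pick for a constant count of 11 (one 8-byte chunk at offset 0, then a widened 4-byte tail at offset 7); the buffer setup and comparison against std::memmove are illustrative assumptions, not Subzero code.

    // Two-phase (load-all, then store-all) schedule checked on an
    // overlapping buffer; mirrors the lowering above for CountValue = 11.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    struct Chunk { uint32_t Width; uint32_t Offset; };
    static const Chunk Schedule[] = {{8, 0}, {4, 7}}; // 8@0, widened tail 4@7

    int main() {
      uint8_t Buf[32], Expected[32];
      for (int i = 0; i < 32; ++i) Buf[i] = uint8_t(i);
      std::memcpy(Expected, Buf, 32);
      std::memmove(Expected, Expected + 4, 11); // reference behavior

      uint8_t *Src = Buf + 4, *Dest = Buf; // regions overlap in Buf[4..10]

      // Phase 1: read every chunk into a "register" before any store.
      uint8_t Regs[2][8];
      for (int i = 0; i < 2; ++i)
        std::memcpy(Regs[i], Src + Schedule[i].Offset, Schedule[i].Width);
      // Phase 2: write the snapshot out; overlap direction no longer matters.
      for (int i = 0; i < 2; ++i)
        std::memcpy(Dest + Schedule[i].Offset, Regs[i], Schedule[i].Width);

      assert(std::memcmp(Buf, Expected, 32) == 0);
    }

The chunks themselves overlap at byte 7, but both the double read and the double write see the same snapshot value, so the result matches std::memmove.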
| 3711 template <class Machine> | 3761 template <class Machine> |
| 3712 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3762 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3713 Operand *Count) { | 3763 Operand *Count) { |
| 3714 constexpr uint32_t UNROLL_LIMIT = 16; | |
| 3715 constexpr uint32_t BytesPerStorep = 16; | 3764 constexpr uint32_t BytesPerStorep = 16; |
| 3716 constexpr uint32_t BytesPerStoreq = 8; | 3765 constexpr uint32_t BytesPerStoreq = 8; |
| 3717 constexpr uint32_t BytesPerStorei32 = 4; | 3766 constexpr uint32_t BytesPerStorei32 = 4; |
| 3718 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3719 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3720 assert(Val->getType() == IceType_i8); | 3767 assert(Val->getType() == IceType_i8); |
| 3721 | 3768 |
| 3722 // Check if the operands are constants | 3769 // Check if the operands are constants |
| 3723 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3770 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3724 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3771 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3725 const bool IsCountConst = CountConst != nullptr; | 3772 const bool IsCountConst = CountConst != nullptr; |
| 3726 const bool IsValConst = ValConst != nullptr; | 3773 const bool IsValConst = ValConst != nullptr; |
| 3727 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3774 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3728 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3775 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3729 | 3776 |
| 3730 // Unlikely, but nothing to do if it does happen | 3777 // Unlikely, but nothing to do if it does happen |
| 3731 if (IsCountConst && CountValue == 0) | 3778 if (IsCountConst && CountValue == 0) |
| 3732 return; | 3779 return; |
| 3733 | 3780 |
| 3734 // TODO(ascull): if the count is constant but val is not it would be possible | 3781 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3735 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3782 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3736 // eax, ax and al. | 3783 // eax, ax and al. |
| 3737 if (IsCountConst && IsValConst) { | 3784 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) { |
| 3738 Variable *Base = nullptr; | 3785 Variable *Base = nullptr; |
| 3786 Variable *VecReg = nullptr; |
| 3739 const uint32_t SpreadValue = | 3787 const uint32_t SpreadValue = |
| 3740 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3788 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3741 Variable *VecReg = nullptr; | |
| 3742 | 3789 |
| 3743 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, | 3790 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3744 uint32_t OffsetAmt) { | 3791 uint32_t OffsetAmt) { |
| 3745 assert(Base != nullptr); | 3792 assert(Base != nullptr); |
| 3746 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3793 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3747 | 3794 |
| 3748 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3795 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3749 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3796 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3750 if (isVectorType(Ty)) { | 3797 if (isVectorType(Ty)) { |
| 3751 assert(VecReg != nullptr); | 3798 assert(VecReg != nullptr); |
| 3752 _storep(VecReg, Mem); | 3799 _storep(VecReg, Mem); |
| 3753 } else if (Ty == IceType_i64) { | 3800 } else if (Ty == IceType_f64) { |
| 3754 assert(VecReg != nullptr); | 3801 assert(VecReg != nullptr); |
| 3755 _storeq(VecReg, Mem); | 3802 _storeq(VecReg, Mem); |
| 3756 } else { | 3803 } else { |
| 3757 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3804 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3758 } | 3805 } |
| 3759 }; | 3806 }; |
| 3760 | 3807 |
| 3761 // Lowers the assignment to the remaining bytes. Assumes the original size | 3808 // Find the largest type that can be used and use it as much as possible in |
| 3762 // was large enough to allow for overlaps. | 3809 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3763 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { | 3810 // the remainder will be at the end, there will be reduced pressure on the |
| 3764 if (Size > BytesPerStoreq) { | 3811 // memory unit as the accesses to the same memory are far apart. |
| 3765 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); | 3812 Type Ty; |
| 3766 } else if (Size > BytesPerStorei32) { | |
| 3767 lowerSet(IceType_i64, CountValue - BytesPerStoreq); | |
| 3768 } else if (Size > BytesPerStorei16) { | |
| 3769 lowerSet(IceType_i32, CountValue - BytesPerStorei32); | |
| 3770 } else if (Size > BytesPerStorei8) { | |
| 3771 lowerSet(IceType_i16, CountValue - BytesPerStorei16); | |
| 3772 } else if (Size == BytesPerStorei8) { | |
| 3773 lowerSet(IceType_i8, CountValue - BytesPerStorei8); | |
| 3774 } | |
| 3775 }; | |
| 3776 | |
| 3777 // When the value is zero it can be loaded into a vector register cheaply | |
| 3778 // using the xor trick. | |
| 3779 if (ValValue == 0 && CountValue >= BytesPerStoreq && | 3813 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3780 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3814 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3815 // When the value is zero it can be loaded into a vector register cheaply |
| 3816 // using the xor trick. |
| 3781 Base = legalizeToReg(Dest); | 3817 Base = legalizeToReg(Dest); |
| 3782 VecReg = makeVectorOfZeros(IceType_v16i8); | 3818 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3819 Ty = largestTypeInSize(CountValue); |
| 3820 } else if (CountValue <= BytesPerStorei32 * Traits::MEMCPY_UNROLL_LIMIT) { |
| 3821 // When the value is non-zero or the count is small we can't use vector |
| 3822 // instructions so are limited to 32-bit stores. |
| 3823 Base = legalizeToReg(Dest); |
| 3824 constexpr uint32_t MaxSize = 4; |
| 3825 Ty = largestTypeInSize(CountValue, MaxSize); |
| 3826 } |
| 3783 | 3827 |
| 3784 // Too small to use large vector operations so use small ones instead | 3828 if (Base) { |
| 3785 if (CountValue < BytesPerStorep) { | 3829 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3786 lowerSet(IceType_i64, 0); | 3830 |
| 3787 lowerLeftOvers(CountValue - BytesPerStoreq); | 3831 uint32_t RemainingBytes = CountValue; |
| 3788 return; | 3832 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3833 while (RemainingBytes >= TyWidth) { |
| 3834 lowerSet(Ty, Offset); |
| 3835 RemainingBytes -= TyWidth; |
| 3836 Offset -= TyWidth; |
| 3789 } | 3837 } |
| 3790 | 3838 |
| 3791 // Use large vector operations | 3839 if (RemainingBytes == 0) |
| 3792 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3840 return; |
| 3793 N -= 16; | |
| 3794 lowerSet(IceType_v16i8, N); | |
| 3795 } | |
| 3796 lowerLeftOvers(CountValue & 0xF); | |
| 3797 return; | |
| 3798 } | |
| 3799 | 3841 |
| 3800 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3842 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3801 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | 3843 // of overlaps in the copies. |
| 3802 Base = legalizeToReg(Dest); | 3844 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3803 // 3 is the awkward size as it is too small for the vector or 32-bit | 3845 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3804 // operations and will not work with lowerLeftOvers as there is no valid | 3846 lowerSet(LeftOverTy, Offset); |
| 3805 // overlap. | |
| 3806 if (CountValue == 3) { | |
| 3807 lowerSet(IceType_i16, 0); | |
| 3808 lowerSet(IceType_i8, 2); | |
| 3809 return; | |
| 3810 } | |
| 3811 | |
| 3812 // TODO(ascull); 64-bit can do better with 64-bit mov | |
| 3813 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3814 N -= 4; | |
| 3815 lowerSet(IceType_i32, N); | |
| 3816 } | |
| 3817 lowerLeftOvers(CountValue & 0x3); | |
| 3818 return; | 3847 return; |
| 3819 } | 3848 } |
| 3820 } | 3849 } |
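A worked trace of the non-zero-value path above, assuming a constant count of 10 and value 0xAB: SpreadValue becomes 0xABABABAB, chunks are capped at 32-bit stores (MaxSize = 4), and the lowering emits dword stores at offsets 4 and 0 followed by a word store at offset 8. The sketch below is a standalone model of that arithmetic, illustrative only; its tail computation is firstTypeThatFitsSize specialized to RemainingBytes < 4.

    // Model of the inlined memset store schedule for a constant count/value.
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t CountValue = 10, ValValue = 0xAB;
      const uint32_t SpreadValue =
          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;

      // Non-zero value: no cheap xor-zeroed vector register, so the chunk
      // type is capped at i32 (TyWidth = 4).
      const uint32_t TyWidth = 4;
      uint32_t RemainingBytes = CountValue;
      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
      while (RemainingBytes >= TyWidth) {
        std::printf("store 0x%08X (%u bytes) at offset %u\n", SpreadValue,
                    TyWidth, Offset);
        RemainingBytes -= TyWidth;
        Offset -= TyWidth; // wraps after the last chunk but is never reused
      }
      if (RemainingBytes != 0) { // tail, widened to the next type that fits
        uint32_t W = RemainingBytes == 1 ? 1 : (RemainingBytes == 2 ? 2 : 4);
        std::printf("store %u bytes at offset %u\n", W, CountValue - W);
      }
    }

For count 10 this prints stores of 0xABABABAB at offsets 4 and 0, then a 2-byte store at offset 8; a 3-byte tail would instead widen to an overlapping 4-byte store.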
| 3821 | 3850 |
| 3822 // Fall back on calling the memset function. The value operand needs to be | 3851 // Fall back on calling the memset function. The value operand needs to be |
| 3823 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3852 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3824 // be at least 32 bits wide. | 3853 // be at least 32 bits wide. |
| 3825 Operand *ValExt; | 3854 Operand *ValExt; |
| 3826 if (IsValConst) { | 3855 if (IsValConst) { |
| 3827 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3856 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1218 matching lines...) |
| 5046 // There aren't any 64-bit integer registers for x86-32. | 5075 // There aren't any 64-bit integer registers for x86-32. |
| 5047 assert(Traits::Is64Bit || Type != IceType_i64); | 5076 assert(Traits::Is64Bit || Type != IceType_i64); |
| 5048 Variable *Reg = Func->makeVariable(Type); | 5077 Variable *Reg = Func->makeVariable(Type); |
| 5049 if (RegNum == Variable::NoRegister) | 5078 if (RegNum == Variable::NoRegister) |
| 5050 Reg->setWeightInfinite(); | 5079 Reg->setWeightInfinite(); |
| 5051 else | 5080 else |
| 5052 Reg->setRegNum(RegNum); | 5081 Reg->setRegNum(RegNum); |
| 5053 return Reg; | 5082 return Reg; |
| 5054 } | 5083 } |
| 5055 | 5084 |
| 5085 template <class Machine> |
| 5086 const Type TargetX86Base<Machine>::TypeForSize[] = { |
| 5087 IceType_i8, IceType_i16, IceType_i32, |
| 5088 (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8}; |
| 5089 template <class Machine> |
| 5090 Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size, |
| 5091 uint32_t MaxSize) { |
| 5092 assert(Size != 0); |
| 5093 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| 5094 uint32_t MaxIndex = MaxSize == NoSizeLimit |
| 5095 ? llvm::array_lengthof(TypeForSize) - 1 |
| 5096 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined); |
| 5097 return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| 5098 } |
| 5099 |
| 5100 template <class Machine> |
| 5101 Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size, |
| 5102 uint32_t MaxSize) { |
| 5103 assert(Size != 0); |
| 5104 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); |
| 5105 if (!llvm::isPowerOf2_32(Size)) |
| 5106 ++TyIndex; |
| 5107 uint32_t MaxIndex = MaxSize == NoSizeLimit |
| 5108 ? llvm::array_lengthof(TypeForSize) - 1 |
| 5109 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined); |
| 5110 return TypeForSize[std::min(TyIndex, MaxIndex)]; |
| 5111 } |
| 5112 |
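The two helpers differ only in rounding direction: largestTypeInSize rounds Size down to the type of its last set bit, while firstTypeThatFitsSize rounds up when Size is not a power of two, which is what lets a 5-byte remainder become a single overlapping 8-byte copy. A standalone sketch of both rules (illustrative; __builtin_clz stands in for llvm::findLastSet, and the cap is taken as a table index rather than a MaxSize in bytes):

    // The two rounding rules behind the type-selection helpers above.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static const char *TypeForSize[] = {"i8", "i16", "i32", "f64/i64", "v16i8"};

    static const char *largestTypeInSize(uint32_t Size, uint32_t MaxIndex = 4) {
      uint32_t TyIndex = 31 - __builtin_clz(Size); // round down to last set bit
      return TypeForSize[std::min(TyIndex, MaxIndex)];
    }

    static const char *firstTypeThatFitsSize(uint32_t Size,
                                             uint32_t MaxIndex = 4) {
      uint32_t TyIndex = 31 - __builtin_clz(Size);
      if (Size & (Size - 1)) // not a power of two: round up instead
        ++TyIndex;
      return TypeForSize[std::min(TyIndex, MaxIndex)];
    }

    int main() {
      assert(largestTypeInSize(13) == TypeForSize[3]);    // 13 -> 8-byte type
      assert(largestTypeInSize(10, 2) == TypeForSize[2]); // capped at i32
      assert(firstTypeThatFitsSize(5) == TypeForSize[3]); // 5 -> 8-byte type
      assert(firstTypeThatFitsSize(3) == TypeForSize[2]); // 3 -> i32
      assert(firstTypeThatFitsSize(4) == TypeForSize[2]); // exact power of two
    }

Rounding the tail up can add one extra chunk beyond the whole-chunk count, which is why lowerMemmove asserts N against MEMMOVE_UNROLL_LIMIT before each buffered move.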
| 5056 template <class Machine> void TargetX86Base<Machine>::postLower() { | 5113 template <class Machine> void TargetX86Base<Machine>::postLower() { |
| 5057 if (Ctx->getFlags().getOptLevel() == Opt_m1) | 5114 if (Ctx->getFlags().getOptLevel() == Opt_m1) |
| 5058 return; | 5115 return; |
| 5059 inferTwoAddress(); | 5116 inferTwoAddress(); |
| 5060 } | 5117 } |
| 5061 | 5118 |
| 5062 template <class Machine> | 5119 template <class Machine> |
| 5063 void TargetX86Base<Machine>::makeRandomRegisterPermutation( | 5120 void TargetX86Base<Machine>::makeRandomRegisterPermutation( |
| 5064 llvm::SmallVectorImpl<int32_t> &Permutation, | 5121 llvm::SmallVectorImpl<int32_t> &Permutation, |
| 5065 const llvm::SmallBitVector &ExcludeRegisters) const { | 5122 const llvm::SmallBitVector &ExcludeRegisters) const { |
| (...skipping 240 matching lines...) |
| 5306 } | 5363 } |
| 5307 // the offset is not eligible for blinding or pooling, return the original | 5364 // the offset is not eligible for blinding or pooling, return the original |
| 5308 // mem operand | 5365 // mem operand |
| 5309 return MemOperand; | 5366 return MemOperand; |
| 5310 } | 5367 } |
| 5311 | 5368 |
| 5312 } // end of namespace X86Internal | 5369 } // end of namespace X86Internal |
| 5313 } // end of namespace Ice | 5370 } // end of namespace Ice |
| 5314 | 5371 |
| 5315 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5372 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |