Chromium Code Reviews

Side by Side Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1278173009: Inline memmove for small constant sizes and refactor memcpy and memset. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Created 5 years, 4 months ago
1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 /// 9 ///
10 /// \file 10 /// \file
(...skipping 3135 matching lines...)
3146 Call->addArg(Instr->getArg(0)); 3146 Call->addArg(Instr->getArg(0));
3147 Call->addArg(Instr->getArg(1)); 3147 Call->addArg(Instr->getArg(1));
3148 lowerCall(Call); 3148 lowerCall(Call);
3149 return; 3149 return;
3150 } 3150 }
3151 case Intrinsics::Memcpy: { 3151 case Intrinsics::Memcpy: {
3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 3152 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3153 return; 3153 return;
3154 } 3154 }
3155 case Intrinsics::Memmove: { 3155 case Intrinsics::Memmove: {
3156 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); 3156 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3157 Call->addArg(Instr->getArg(0));
3158 Call->addArg(Instr->getArg(1));
3159 Call->addArg(Instr->getArg(2));
3160 lowerCall(Call);
3161 return; 3157 return;
3162 } 3158 }
3163 case Intrinsics::Memset: { 3159 case Intrinsics::Memset: {
3164 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 3160 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3165 return; 3161 return;
3166 } 3162 }
3167 case Intrinsics::NaClReadTP: { 3163 case Intrinsics::NaClReadTP: {
3168 if (Ctx->getFlags().getUseSandboxing()) { 3164 if (Ctx->getFlags().getUseSandboxing()) {
3169 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); 3165 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand);
3170 Variable *Dest = Instr->getDest(); 3166 Variable *Dest = Instr->getDest();
(...skipping 422 matching lines...)
3593 _bsr(T_Dest2, SecondVar); 3589 _bsr(T_Dest2, SecondVar);
3594 _xor(T_Dest2, ThirtyOne); 3590 _xor(T_Dest2, ThirtyOne);
3595 } 3591 }
3596 _test(SecondVar, SecondVar); 3592 _test(SecondVar, SecondVar);
3597 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); 3593 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
3598 _mov(DestLo, T_Dest2); 3594 _mov(DestLo, T_Dest2);
3599 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); 3595 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
3600 } 3596 }
3601 3597
3602 template <class Machine> 3598 template <class Machine>
3599 void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base,
3600 Constant *Offset) {
3601 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
3602
3603 if (isVectorType(Ty))
3604 _movp(Dest, Mem);
3605 else if (Ty == IceType_f64)
3606 _movq(Dest, Mem);
3607 else
3608 _mov(Dest, Mem);
3609 }
3610
3611 template <class Machine>
3612 void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value,
3613 Variable *Base, Constant *Offset) {
3614 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
3615
3616 if (isVectorType(Ty))
3617 _storep(Value, Mem);
3618 else if (Ty == IceType_f64)
3619 _storeq(Value, Mem);
3620 else
3621 _store(Value, Mem);
3622 }
3623
3624 template <class Machine>
3625 void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src,
3626 int32_t OffsetAmt) {
3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
3628 // TODO(ascull): this or add nullptr test to _movp, _movq
3629 Variable *Data = makeReg(Ty);
3630
3631 typedLoad(Ty, Data, Src, Offset);
3632 typedStore(Ty, Data, Dest, Offset);
3633 }
3634
3635 template <class Machine>
3603 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, 3636 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
3604 Operand *Count) { 3637 Operand *Count) {
3605 // There is a load and store for each chunk in the unroll 3638 // There is a load and store for each chunk in the unroll
3606 constexpr uint32_t UNROLL_LIMIT = 8;
3607 constexpr uint32_t BytesPerStorep = 16; 3639 constexpr uint32_t BytesPerStorep = 16;
3608 constexpr uint32_t BytesPerStoreq = 8;
3609 constexpr uint32_t BytesPerStorei32 = 4;
3610 constexpr uint32_t BytesPerStorei16 = 2;
3611 constexpr uint32_t BytesPerStorei8 = 1;
3612 3640
3613 // Check if the operands are constants 3641 // Check if the operands are constants
3614 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 3642 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3615 const bool IsCountConst = CountConst != nullptr; 3643 const bool IsCountConst = CountConst != nullptr;
3616 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 3644 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3617 3645
3618 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { 3646 if (shouldOptimizeMemIntrins() && IsCountConst &&
3647 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
3619 // Unlikely, but nothing to do if it does happen 3648 // Unlikely, but nothing to do if it does happen
3620 if (CountValue == 0) 3649 if (CountValue == 0)
3621 return; 3650 return;
3622 3651
3623 Variable *SrcBase = legalizeToReg(Src); 3652 Variable *SrcBase = legalizeToReg(Src);
3624 Variable *DestBase = legalizeToReg(Dest); 3653 Variable *DestBase = legalizeToReg(Dest);
3625 3654
3626 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { 3655 // Find the largest type that can be used and use it as much as possible in
3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 3656 // reverse order. Then handle any remainder with overlapping copies. Since
3628 // TODO(ascull): this or add nullptr test to _movp, _movq 3657 // the remainder will be at the end, there will be reduced pressure on the
3629 Variable *Data = makeReg(Ty); 3658 // memory unit as the accesses to the same memory are far apart.
3659 Type Ty = largestTypeInSize(CountValue);
3660 uint32_t TyWidth = typeWidthInBytes(Ty);
3630 3661
3631 // TODO(ascull): is 64-bit better with vector or scalar movq? 3662 uint32_t RemainingBytes = CountValue;
3632 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); 3663 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
3633 if (isVectorType(Ty)) 3664 while (RemainingBytes >= TyWidth) {
3634 _movp(Data, SrcMem); 3665 copyMemory(Ty, DestBase, SrcBase, Offset);
3635 else if (Ty == IceType_f64) 3666 RemainingBytes -= TyWidth;
3636 _movq(Data, SrcMem); 3667 Offset -= TyWidth;
3637 else
3638 _mov(Data, SrcMem);
3639
3640 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
3641 if (isVectorType(Ty))
3642 _storep(Data, DestMem);
3643 else if (Ty == IceType_f64)
3644 _storeq(Data, DestMem);
3645 else
3646 _store(Data, DestMem);
3647 };
3648
3649 // Lowers the assignment to the remaining bytes. Assumes the original size
3650 // was large enough to allow for overlaps.
3651 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
3652 if (Size > BytesPerStoreq) {
3653 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
3654 } else if (Size > BytesPerStorei32) {
3655 lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
3656 } else if (Size > BytesPerStorei16) {
3657 lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
3658 } else if (Size > BytesPerStorei8) {
3659 lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
3660 } else if (Size == BytesPerStorei8) {
3661 lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
3662 }
3663 };
3664
3665 if (CountValue >= BytesPerStorep) {
3666 // Use large vector operations
3667 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
3668 N -= BytesPerStorep;
3669 lowerCopy(IceType_v16i8, N);
3670 }
3671 lowerLeftOvers(CountValue & 0xF);
3672 return;
3673 } 3668 }
3674 3669
3675 // Too small to use large vector operations so use small ones instead 3670 if (RemainingBytes == 0)
3676 if (CountValue >= BytesPerStoreq) {
3677 lowerCopy(IceType_f64, 0);
3678 lowerLeftOvers(CountValue - BytesPerStoreq);
3679 return; 3671 return;
3680 }
3681 3672
3682 // Too small for vector operations so use scalar ones 3673 // Lower the remaining bytes. Adjust to larger types in order to make use
3683 if (CountValue >= BytesPerStorei32) { 3674 // of overlaps in the copies.
3684 lowerCopy(IceType_i32, 0); 3675 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
3685 lowerLeftOvers(CountValue - BytesPerStorei32); 3676 Offset = CountValue - typeWidthInBytes(LeftOverTy);
3686 return; 3677 copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
3687 }
3688
3689 // 3 is the awkward size as it is too small for the vector or 32-bit
3690 // operations and will not work with lowerLeftOvers as there is no valid
3691 // overlap.
3692 if (CountValue == 3) {
3693 lowerCopy(IceType_i16, 0);
3694 lowerCopy(IceType_i8, 2);
3695 return;
3696 }
3697
3698 // 1 or 2 can be done in a single scalar copy
3699 lowerLeftOvers(CountValue);
3700 return; 3678 return;
3701 } 3679 }
3702 3680
3703 // Fall back on a function call 3681 // Fall back on a function call
3704 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); 3682 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
3705 Call->addArg(Dest); 3683 Call->addArg(Dest);
3706 Call->addArg(Src); 3684 Call->addArg(Src);
3707 Call->addArg(Count); 3685 Call->addArg(Count);
3686 lowerCall(Call);
3687 }
3688
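A minimal host-side sketch of the chunking arithmetic in the new lowerMemcpy above (plain C++, not Subzero code; the helper names largestTypeWidth/firstWidthThatFits and the 29-byte count are illustrative, and the TypeForSize widths {1, 2, 4, 8, 16} are taken from the table added later in this patch). It walks full-width chunks from the highest aligned offset down, then covers the tail with a single overlapping copy:

#include <cstdint>
#include <cstdio>

static uint32_t largestTypeWidth(uint32_t Size) {
  // Width of the largest power-of-two chunk <= Size, capped at 16 bytes.
  uint32_t W = 1;
  while (W * 2 <= Size && W < 16)
    W *= 2;
  return W;
}

static uint32_t firstWidthThatFits(uint32_t Size) {
  // Width of the smallest power-of-two chunk >= Size, capped at 16 bytes.
  uint32_t W = 1;
  while (W < Size && W < 16)
    W *= 2;
  return W;
}

int main() {
  const uint32_t Count = 29; // hypothetical constant memcpy length
  const uint32_t W = largestTypeWidth(Count);
  uint32_t Remaining = Count;
  int32_t Offset = (Count & ~(W - 1)) - W; // highest aligned offset
  while (Remaining >= W) { // full-width chunks, in reverse order
    std::printf("copy %u bytes at offset %d\n", W, Offset);
    Remaining -= W;
    Offset -= W;
  }
  if (Remaining != 0) { // one overlapping copy covers the tail
    const uint32_t LW = firstWidthThatFits(Remaining);
    std::printf("copy %u bytes at offset %d\n", LW, (int)(Count - LW));
  }
  // For Count == 29 this emits one 16-byte copy at offset 0 and one
  // overlapping 16-byte copy at offset 13, matching the lowering above.
  return 0;
}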
3689 template <class Machine>
3690 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
3691 Operand *Count) {
3692 // There is a load and store for each chunk in the unroll
3693 constexpr uint32_t BytesPerStorep = 16;
3694
3695 // Check if the operands are constants
3696 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3697 const bool IsCountConst = CountConst != nullptr;
3698 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3699
3700 if (shouldOptimizeMemIntrins() && IsCountConst &&
3701 CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
3702 // Unlikely, but nothing to do if it does happen
3703 if (CountValue == 0)
3704 return;
3705
3706 Variable *SrcBase = legalizeToReg(Src);
3707 Variable *DestBase = legalizeToReg(Dest);
3708
3709 std::tuple<Type, Constant *, Variable *>
3710 Moves[Traits::MEMMOVE_UNROLL_LIMIT];
3711 Constant *Offset;
3712 Variable *Reg;
3713
3714 // Copy the data into registers as the source and destination could overlap
3715 // so make sure not to clobber the memory. This also means overlapping moves
3716 // can be used as we are taking a safe snapshot of the memory.
3717 Type Ty = largestTypeInSize(CountValue);
3718 uint32_t TyWidth = typeWidthInBytes(Ty);
3719
3720 uint32_t RemainingBytes = CountValue;
3721 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
3722 size_t N = 0;
3723 while (RemainingBytes >= TyWidth) {
3724 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
3725 Offset = Ctx->getConstantInt32(OffsetAmt);
3726 Reg = makeReg(Ty);
3727 typedLoad(Ty, Reg, SrcBase, Offset);
3728 RemainingBytes -= TyWidth;
3729 OffsetAmt -= TyWidth;
3730 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
3731 }
3732
3733 if (RemainingBytes != 0) {
3734 // Lower the remaining bytes. Adjust to larger types in order to make use
3735 // of overlaps in the copies.
3736 assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
3737 Ty = firstTypeThatFitsSize(RemainingBytes);
3738 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
3739 Reg = makeReg(Ty);
3740 typedLoad(Ty, Reg, SrcBase, Offset);
3741 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
3742 }
3743
3744 // Copy the data out into the destination memory
3745 for (size_t i = 0; i < N; ++i) {
3746 std::tie(Ty, Offset, Reg) = Moves[i];
3747 typedStore(Ty, Reg, DestBase, Offset);
3748 }
3749
3750 return;
3751 }
3752
3753 // Fall back on a function call
3754 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
3755 Call->addArg(Dest);
3756 Call->addArg(Src);
3757 Call->addArg(Count);
3708 lowerCall(Call); 3758 lowerCall(Call);
3709 } 3759 }
3710 3760
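A minimal sketch of the ordering constraint the new lowerMemmove enforces (plain C++, not Subzero IR; smallMemmove and the 32-byte snapshot buffer are illustrative assumptions). Every chunk is loaded into a temporary before any store is issued, so an overlapping destination can never clobber source bytes that are still unread; the real lowering does the same with at most MEMMOVE_UNROLL_LIMIT register-width chunks plus one overlapping tail load:

#include <cstdint>

// Hypothetical helper: two-phase copy for Count <= 32. Phase 1 snapshots
// the source, phase 2 writes the snapshot out, so Dest/Src may overlap.
static void smallMemmove(uint8_t *Dest, const uint8_t *Src, uint32_t Count) {
  uint8_t Snap[32]; // stand-in for the xmm/gpr snapshot registers
  for (uint32_t i = 0; i < Count; ++i) // phase 1: all loads first
    Snap[i] = Src[i];
  for (uint32_t i = 0; i < Count; ++i) // phase 2: then all stores
    Dest[i] = Snap[i];
}

int main() {
  uint8_t Buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  smallMemmove(Buf + 2, Buf, 6); // overlapping move: {0,1,0,1,2,3,4,5}
  return 0;
}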
3711 template <class Machine> 3761 template <class Machine>
3712 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, 3762 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
3713 Operand *Count) { 3763 Operand *Count) {
3714 constexpr uint32_t UNROLL_LIMIT = 16;
3715 constexpr uint32_t BytesPerStorep = 16; 3764 constexpr uint32_t BytesPerStorep = 16;
3716 constexpr uint32_t BytesPerStoreq = 8; 3765 constexpr uint32_t BytesPerStoreq = 8;
3717 constexpr uint32_t BytesPerStorei32 = 4; 3766 constexpr uint32_t BytesPerStorei32 = 4;
3718 constexpr uint32_t BytesPerStorei16 = 2;
3719 constexpr uint32_t BytesPerStorei8 = 1;
3720 assert(Val->getType() == IceType_i8); 3767 assert(Val->getType() == IceType_i8);
3721 3768
3722 // Check if the operands are constants 3769 // Check if the operands are constants
3723 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 3770 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3724 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); 3771 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
3725 const bool IsCountConst = CountConst != nullptr; 3772 const bool IsCountConst = CountConst != nullptr;
3726 const bool IsValConst = ValConst != nullptr; 3773 const bool IsValConst = ValConst != nullptr;
3727 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 3774 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3728 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; 3775 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
3729 3776
3730 // Unlikely, but nothing to do if it does happen 3777 // Unlikely, but nothing to do if it does happen
3731 if (IsCountConst && CountValue == 0) 3778 if (IsCountConst && CountValue == 0)
3732 return; 3779 return;
3733 3780
3734 // TODO(ascull): if the count is constant but val is not it would be possible 3781 // TODO(ascull): if the count is constant but val is not it would be possible
3735 // to inline by spreading the value across 4 bytes and accessing subregs e.g. 3782 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
3736 // eax, ax and al. 3783 // eax, ax and al.
3737 if (IsCountConst && IsValConst) { 3784 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
3738 Variable *Base = nullptr; 3785 Variable *Base = nullptr;
3786 Variable *VecReg = nullptr;
3739 const uint32_t SpreadValue = 3787 const uint32_t SpreadValue =
3740 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; 3788 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
3741 Variable *VecReg = nullptr;
3742 3789
3743 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, 3790 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
3744 uint32_t OffsetAmt) { 3791 uint32_t OffsetAmt) {
3745 assert(Base != nullptr); 3792 assert(Base != nullptr);
3746 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 3793 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
3747 3794
3748 // TODO(ascull): is 64-bit better with vector or scalar movq? 3795 // TODO(ascull): is 64-bit better with vector or scalar movq?
3749 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); 3796 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
3750 if (isVectorType(Ty)) { 3797 if (isVectorType(Ty)) {
3751 assert(VecReg != nullptr); 3798 assert(VecReg != nullptr);
3752 _storep(VecReg, Mem); 3799 _storep(VecReg, Mem);
3753 } else if (Ty == IceType_i64) { 3800 } else if (Ty == IceType_f64) {
3754 assert(VecReg != nullptr); 3801 assert(VecReg != nullptr);
3755 _storeq(VecReg, Mem); 3802 _storeq(VecReg, Mem);
3756 } else { 3803 } else {
3757 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); 3804 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
3758 } 3805 }
3759 }; 3806 };
3760 3807
3761 // Lowers the assignment to the remaining bytes. Assumes the original size 3808 // Find the largest type that can be used and use it as much as possible in
3762 // was large enough to allow for overlaps. 3809 // reverse order. Then handle any remainder with overlapping copies. Since
3763 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { 3810 // the remainder will be at the end, there will be reduced pressure on the
3764 if (Size > BytesPerStoreq) { 3811 // memory unit as the accesses to the same memory are far apart.
3765 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); 3812 Type Ty;
3766 } else if (Size > BytesPerStorei32) {
3767 lowerSet(IceType_i64, CountValue - BytesPerStoreq);
3768 } else if (Size > BytesPerStorei16) {
3769 lowerSet(IceType_i32, CountValue - BytesPerStorei32);
3770 } else if (Size > BytesPerStorei8) {
3771 lowerSet(IceType_i16, CountValue - BytesPerStorei16);
3772 } else if (Size == BytesPerStorei8) {
3773 lowerSet(IceType_i8, CountValue - BytesPerStorei8);
3774 }
3775 };
3776
3777 // When the value is zero it can be loaded into a vector register cheaply
3778 // using the xor trick.
3779 if (ValValue == 0 && CountValue >= BytesPerStoreq && 3813 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
3780 CountValue <= BytesPerStorep * UNROLL_LIMIT) { 3814 CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
3815 // When the value is zero it can be loaded into a vector register cheaply
3816 // using the xor trick.
3781 Base = legalizeToReg(Dest); 3817 Base = legalizeToReg(Dest);
3782 VecReg = makeVectorOfZeros(IceType_v16i8); 3818 VecReg = makeVectorOfZeros(IceType_v16i8);
3819 Ty = largestTypeInSize(CountValue);
3820 } else if (CountValue <= BytesPerStorei32 * Traits::MEMCPY_UNROLL_LIMIT) {
3821 // When the value is non-zero or the count is small we can't use vector
3822 // instructions so are limited to 32-bit stores.
3823 Base = legalizeToReg(Dest);
3824 constexpr uint32_t MaxSize = 4;
3825 Ty = largestTypeInSize(CountValue, MaxSize);
3826 }
3783 3827
3784 // Too small to use large vector operations so use small ones instead 3828 if (Base) {
3785 if (CountValue < BytesPerStorep) { 3829 uint32_t TyWidth = typeWidthInBytes(Ty);
3786 lowerSet(IceType_i64, 0); 3830
3787 lowerLeftOvers(CountValue - BytesPerStoreq); 3831 uint32_t RemainingBytes = CountValue;
3788 return; 3832 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
3833 while (RemainingBytes >= TyWidth) {
3834 lowerSet(Ty, Offset);
3835 RemainingBytes -= TyWidth;
3836 Offset -= TyWidth;
3789 } 3837 }
3790 3838
3791 // Use large vector operations 3839 if (RemainingBytes == 0)
3792 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { 3840 return;
3793 N -= 16;
3794 lowerSet(IceType_v16i8, N);
3795 }
3796 lowerLeftOvers(CountValue & 0xF);
3797 return;
3798 }
3799 3841
3800 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? 3842 // Lower the remaining bytes. Adjust to larger types in order to make use
3801 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { 3843 // of overlaps in the copies.
3802 Base = legalizeToReg(Dest); 3844 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
3803 // 3 is the awkward size as it is too small for the vector or 32-bit 3845 Offset = CountValue - typeWidthInBytes(LeftOverTy);
3804 // operations and will not work with lowerLeftOvers as there is no valid 3846 lowerSet(LeftOverTy, Offset);
3805 // overlap.
3806 if (CountValue == 3) {
3807 lowerSet(IceType_i16, 0);
3808 lowerSet(IceType_i8, 2);
3809 return;
3810 }
3811
3812 // TODO(ascull); 64-bit can do better with 64-bit mov
3813 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
3814 N -= 4;
3815 lowerSet(IceType_i32, N);
3816 }
3817 lowerLeftOvers(CountValue & 0x3);
3818 return; 3847 return;
3819 } 3848 }
3820 } 3849 }
3821 3850
3822 // Fall back on calling the memset function. The value operand needs to be 3851 // Fall back on calling the memset function. The value operand needs to be
3823 // extended to a stack slot size because the PNaCl ABI requires arguments to 3852 // extended to a stack slot size because the PNaCl ABI requires arguments to
3824 // be at least 32 bits wide. 3853 // be at least 32 bits wide.
3825 Operand *ValExt; 3854 Operand *ValExt;
3826 if (IsValConst) { 3855 if (IsValConst) {
3827 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); 3856 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
(...skipping 1218 matching lines...)
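One detail of lowerMemset worth calling out: on the scalar-store path the i8 value is replicated across all four bytes of an i32 (the SpreadValue computation above), so a single 32-bit store writes four bytes of the pattern, while the zero case instead clears a vector register via makeVectorOfZeros. A tiny sketch of the byte spread (plain C++; spread is a hypothetical name):

#include <cassert>
#include <cstdint>

// Replicate an 8-bit value into every byte of a 32-bit word, mirroring
// SpreadValue = (V << 24) | (V << 16) | (V << 8) | V from the patch.
static uint32_t spread(uint8_t V) {
  const uint32_t X = V;
  return (X << 24) | (X << 16) | (X << 8) | X;
}

int main() {
  assert(spread(0xAB) == 0xABABABABu); // one i32 store covers 4 bytes
  return 0;
}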
5046 // There aren't any 64-bit integer registers for x86-32. 5075 // There aren't any 64-bit integer registers for x86-32.
5047 assert(Traits::Is64Bit || Type != IceType_i64); 5076 assert(Traits::Is64Bit || Type != IceType_i64);
5048 Variable *Reg = Func->makeVariable(Type); 5077 Variable *Reg = Func->makeVariable(Type);
5049 if (RegNum == Variable::NoRegister) 5078 if (RegNum == Variable::NoRegister)
5050 Reg->setWeightInfinite(); 5079 Reg->setWeightInfinite();
5051 else 5080 else
5052 Reg->setRegNum(RegNum); 5081 Reg->setRegNum(RegNum);
5053 return Reg; 5082 return Reg;
5054 } 5083 }
5055 5084
5085 template <class Machine>
5086 const Type TargetX86Base<Machine>::TypeForSize[] = {
5087 IceType_i8, IceType_i16, IceType_i32,
5088 (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
5089 template <class Machine>
5090 Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
5091 uint32_t MaxSize) {
5092 assert(Size != 0);
5093 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
5094 uint32_t MaxIndex = MaxSize == NoSizeLimit
5095 ? llvm::array_lengthof(TypeForSize) - 1
5096 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
5097 return TypeForSize[std::min(TyIndex, MaxIndex)];
5098 }
5099
5100 template <class Machine>
5101 Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size,
5102 uint32_t MaxSize) {
5103 assert(Size != 0);
5104 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
5105 if (!llvm::isPowerOf2_32(Size))
5106 ++TyIndex;
5107 uint32_t MaxIndex = MaxSize == NoSizeLimit
5108 ? llvm::array_lengthof(TypeForSize) - 1
5109 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
5110 return TypeForSize[std::min(TyIndex, MaxIndex)];
5111 }
5112
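The two type-picking helpers above reduce to floor and ceiling of log2, clamped to the TypeForSize table. A host-side sketch under that reading (plain C++; floorLog2, largestWidthInSize, and firstWidthThatFits are illustrative names, and the MaxSize parameter is folded into a fixed 16-byte cap for brevity):

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t floorLog2(uint32_t X) { // what llvm::findLastSet computes
  uint32_t I = 0;
  while (X >>= 1)
    ++I;
  return I;
}

// largestTypeInSize: widest power-of-two width <= Size, capped at 16.
static uint32_t largestWidthInSize(uint32_t Size) {
  return std::min(1u << floorLog2(Size), 16u);
}

// firstTypeThatFitsSize: narrowest power-of-two width >= Size, capped at 16.
static uint32_t firstWidthThatFits(uint32_t Size) {
  uint32_t I = floorLog2(Size);
  if ((Size & (Size - 1)) != 0) // not a power of two: round up
    ++I;
  return std::min(1u << I, 16u);
}

int main() {
  assert(largestWidthInSize(13) == 8);  // full chunks use i64/f64 stores
  assert(firstWidthThatFits(13) == 16); // tail: one overlapping v16i8 copy
  return 0;
}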
5056 template <class Machine> void TargetX86Base<Machine>::postLower() { 5113 template <class Machine> void TargetX86Base<Machine>::postLower() {
5057 if (Ctx->getFlags().getOptLevel() == Opt_m1) 5114 if (Ctx->getFlags().getOptLevel() == Opt_m1)
5058 return; 5115 return;
5059 inferTwoAddress(); 5116 inferTwoAddress();
5060 } 5117 }
5061 5118
5062 template <class Machine> 5119 template <class Machine>
5063 void TargetX86Base<Machine>::makeRandomRegisterPermutation( 5120 void TargetX86Base<Machine>::makeRandomRegisterPermutation(
5064 llvm::SmallVectorImpl<int32_t> &Permutation, 5121 llvm::SmallVectorImpl<int32_t> &Permutation,
5065 const llvm::SmallBitVector &ExcludeRegisters) const { 5122 const llvm::SmallBitVector &ExcludeRegisters) const {
(...skipping 240 matching lines...)
5306 } 5363 }
5307 // the offset is not eligible for blinding or pooling, return the original 5364 // the offset is not eligible for blinding or pooling, return the original
5308 // mem operand 5365 // mem operand
5309 return MemOperand; 5366 return MemOperand;
5310 } 5367 }
5311 5368
5312 } // end of namespace X86Internal 5369 } // end of namespace X86Internal
5313 } // end of namespace Ice 5370 } // end of namespace Ice
5314 5371
5315 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 5372 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H