Chromium Code Reviews
Unified Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1278173009: Inline memmove for small constant sizes and refactor memcpy and memset. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: r1 == rax Created 5 years, 4 months ago
 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
 //
 // The Subzero Code Generator
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
(...skipping 3016 matching lines...)
     Call->addArg(Instr->getArg(0));
     Call->addArg(Instr->getArg(1));
     lowerCall(Call);
     return;
   }
   case Intrinsics::Memcpy: {
     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memmove: {
-    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memset: {
     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::NaClReadTP: {
     if (Ctx->getFlags().getUseSandboxing()) {
       Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand);
       Variable *Dest = Instr->getDest();
(...skipping 422 matching lines...)
     _bsr(T_Dest2, SecondVar);
     _xor(T_Dest2, ThirtyOne);
   }
   _test(SecondVar, SecondVar);
   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
   _mov(DestLo, T_Dest2);
   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
 }
 
 template <class Machine>
+void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base,
+                                       Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _movp(Dest, Mem);
+  else if (Ty == IceType_f64)
+    _movq(Dest, Mem);
+  else
+    _mov(Dest, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value,
+                                        Variable *Base, Constant *Offset) {
+  auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+
+  if (isVectorType(Ty))
+    _storep(Value, Mem);
+  else if (Ty == IceType_f64)
+    _storeq(Value, Mem);
+  else
+    _store(Value, Mem);
+}
+
+template <class Machine>
+void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src,
+                                        int32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  typedLoad(Ty, Data, Src, Offset);
+  typedStore(Ty, Data, Dest, Offset);
+}
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
                                          Operand *Count) {
   // There is a load and store for each chunk in the unroll
   constexpr uint32_t UNROLL_LIMIT = 8;
   constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
 
   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const bool IsCountConst = CountConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
 
-  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * UNROLL_LIMIT) {
     // Unlikely, but nothing to do if it does happen
     if (CountValue == 0)
       return;
 
     Variable *SrcBase = legalizeToReg(Src);
     Variable *DestBase = legalizeToReg(Dest);
 
-    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
-      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-      // TODO(ascull): this or add nullptr test to _movp, _movq
-      Variable *Data = makeReg(Ty);
-
-      // TODO(ascull): is 64-bit better with vector or scalar movq?
-      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
-      if (isVectorType(Ty))
-        _movp(Data, SrcMem);
-      else if (Ty == IceType_f64)
-        _movq(Data, SrcMem);
-      else
-        _mov(Data, SrcMem);
-
-      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
-      if (isVectorType(Ty))
-        _storep(Data, DestMem);
-      else if (Ty == IceType_f64)
-        _storeq(Data, DestMem);
-      else
-        _store(Data, DestMem);
-    };
-
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    if (CountValue >= BytesPerStorep) {
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= BytesPerStorep;
-        lowerCopy(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
-    }
-
-    // Too small to use large vector operations so use small ones instead
-    if (CountValue >= BytesPerStoreq) {
-      lowerCopy(IceType_f64, 0);
-      lowerLeftOvers(CountValue - BytesPerStoreq);
-      return;
-    }
-
-    // Too small for vector operations so use scalar ones
-    if (CountValue >= BytesPerStorei32) {
-      lowerCopy(IceType_i32, 0);
-      lowerLeftOvers(CountValue - BytesPerStorei32);
-      return;
-    }
-
-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      lowerCopy(IceType_i16, 0);
-      lowerCopy(IceType_i8, 2);
-      return;
-    }
-
-    // 1 or 2 can be done in a single scalar copy
-    lowerLeftOvers(CountValue);
+    // Find the largest type that can be used and use it as much as possible
+    // in reverse order. Then handle any remainder with overlapping copies.
+    // Since the remainder will be at the end, there will be reduced pressure
+    // on the memory unit as the accesses to the same memory are far apart.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    while (RemainingBytes >= TyWidth) {
+      copyMemory(Ty, DestBase, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      Offset -= TyWidth;
+    }
+
+    if (RemainingBytes == 0)
+      return;
+
+    // Lower the remaining bytes. Adjust to larger types in order to make use
+    // of overlaps in the copies.
+    Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+    Offset = CountValue - typeWidthInBytes(LeftOverTy);
+    copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
     return;
   }
 
   // Fall back on a function call
   InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
   Call->addArg(Dest);
   Call->addArg(Src);
   Call->addArg(Count);
   lowerCall(Call);
 }
 
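The new strategy is easiest to see with concrete numbers. The following standalone sketch (illustrative only, not part of the patch; printf stands in for the emitted load/store pairs) reproduces the (width, offset) schedule of the loop above, including the overlapping tail copy:

#include <cstdint>
#include <cstdio>

// Prints the (width, offset) schedule the new lowering produces for a
// constant Count: full-width chunks in reverse order, then one copy for
// the tail, rounded up to the next power of two so it overlaps backwards.
static void schedule(uint32_t Count, uint32_t TyWidth) {
  uint32_t RemainingBytes = Count;
  int32_t Offset = (Count & ~(TyWidth - 1)) - TyWidth;
  while (RemainingBytes >= TyWidth) {
    printf("copy %u bytes at offset %d\n", TyWidth, Offset);
    RemainingBytes -= TyWidth;
    Offset -= TyWidth;
  }
  if (RemainingBytes == 0)
    return;
  uint32_t LeftOverWidth = 1; // like firstTypeThatFitsSize: next power of two
  while (LeftOverWidth < RemainingBytes)
    LeftOverWidth *= 2;
  printf("copy %u bytes at offset %u (overlaps previous chunk)\n",
         LeftOverWidth, Count - LeftOverWidth);
}

int main() {
  // 27 bytes with 16-byte chunks: one full chunk at offset 0, then the
  // 11-byte tail becomes a single 16-byte copy at offset 11, re-writing
  // bytes 11..15 with the identical data already stored there.
  schedule(27, 16);
}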
+template <class Machine>
+void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
+                                          Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t UNROLL_LIMIT = 8; // 32-bit has 8 xmm registers

jvoung (off chromium) 2015/08/20 15:42:53: I'm not sure the exact side effects on register al…
Jim Stichnoth 2015/08/20 16:57:04: Could also get the limit from the Machine traits o…
ascull 2015/08/20 18:41:56: I put it in traits and left the value as is for now.

+  constexpr uint32_t BytesPerStorep = 16;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (shouldOptimizeMemIntrins() && IsCountConst &&
+      CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    std::tuple<Type, Constant *, Variable *> Moves[UNROLL_LIMIT];
+    Constant *Offset;
+    Variable *Reg;
+
+    // Copy the data into registers as the source and destination could
+    // overlap, so make sure not to clobber the memory. This also means
+    // overlapping moves can be used as we are taking a safe snapshot of the
+    // memory.
+    Type Ty = largestTypeInSize(CountValue);
+    uint32_t TyWidth = typeWidthInBytes(Ty);
+
+    uint32_t RemainingBytes = CountValue;
+    int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
+    size_t N = 0;
+    while (RemainingBytes >= TyWidth) {
+      assert(N <= UNROLL_LIMIT);
+      Offset = Ctx->getConstantInt32(OffsetAmt);
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      RemainingBytes -= TyWidth;
+      OffsetAmt -= TyWidth;
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    if (RemainingBytes != 0) {
+      // Lower the remaining bytes. Adjust to larger types in order to make
+      // use of overlaps in the copies.
+      assert(N <= UNROLL_LIMIT);
+      Ty = firstTypeThatFitsSize(RemainingBytes);
+      Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
+      Reg = makeReg(Ty);
+      typedLoad(Ty, Reg, SrcBase, Offset);
+      Moves[N++] = std::make_tuple(Ty, Offset, Reg);
+    }
+
+    // Copy the data out into the destination memory
+    for (size_t i = 0; i < N; ++i) {
+      std::tie(Ty, Offset, Reg) = Moves[i];
+      typedStore(Ty, Reg, DestBase, Offset);
+    }
+
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
 
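The "safe snapshot" comment is the key difference from lowerMemcpy: every chunk is loaded into a register before any store is issued, so an overlapping destination cannot clobber source bytes that are still needed. A plain C++ stand-in for the idea (hypothetical helper, not Subzero code), fixed at one 8-byte move split into two 4-byte chunks:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Load phase first, store phase second. With dst and src overlapping,
// interleaved load/store pairs would read bytes a previous store already
// clobbered; snapshotting both chunks up front avoids that. The real
// lowering keeps the chunks in xmm/gpr registers instead of locals.
static void inlineMemmove8(uint8_t *Dst, const uint8_t *Src) {
  uint32_t Lo, Hi;
  memcpy(&Lo, Src + 0, 4); // load phase: snapshot the whole source
  memcpy(&Hi, Src + 4, 4);
  memcpy(Dst + 0, &Lo, 4); // store phase: safe even if Dst overlaps Src
  memcpy(Dst + 4, &Hi, 4);
}

int main() {
  uint8_t Buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  inlineMemmove8(Buf + 2, Buf); // overlapping, like memmove(Buf + 2, Buf, 8)
  for (uint8_t B : Buf)
    printf("%d ", B);
  // Prints: 0 1 0 1 2 3 4 5 6 7 10 11
}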
 template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                          Operand *Count) {
   constexpr uint32_t UNROLL_LIMIT = 16;
   constexpr uint32_t BytesPerStorep = 16;
   constexpr uint32_t BytesPerStoreq = 8;
   constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);
 
   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
   const bool IsCountConst = CountConst != nullptr;
   const bool IsValConst = ValConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
 
   // Unlikely, but nothing to do if it does happen
   if (IsCountConst && CountValue == 0)
     return;
 
   // TODO(ascull): if the count is constant but val is not it would be possible
   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
   // eax, ax and al.
-  if (IsCountConst && IsValConst) {
+  if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
     Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
     const uint32_t SpreadValue =
         (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-    Variable *VecReg = nullptr;
 
     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
                                                         uint32_t OffsetAmt) {
       assert(Base != nullptr);
       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
 
       // TODO(ascull): is 64-bit better with vector or scalar movq?
       auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
       if (isVectorType(Ty)) {
         assert(VecReg != nullptr);
         _storep(VecReg, Mem);
-      } else if (Ty == IceType_i64) {
+      } else if (Ty == IceType_f64) {
         assert(VecReg != nullptr);
         _storeq(VecReg, Mem);
       } else {
         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
       }
     };
 
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    // When the value is zero it can be loaded into a vector register cheaply
-    // using the xor trick.
+    // Find the largest type that can be used and use it as much as possible
+    // in reverse order. Then handle any remainder with overlapping copies.
+    // Since the remainder will be at the end, there will be reduced pressure
+    // on the memory unit as the accesses to the same memory are far apart.
+    Type Ty;
     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
         CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
       Base = legalizeToReg(Dest);
       VecReg = makeVectorOfZeros(IceType_v16i8);
+      Ty = largestTypeInSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr uint32_t MaxSize = 4;
+      Ty = largestTypeInSize(CountValue, MaxSize);
+    }
 
-      // Too small to use large vector operations so use small ones instead
-      if (CountValue < BytesPerStorep) {
-        lowerSet(IceType_i64, 0);
-        lowerLeftOvers(CountValue - BytesPerStoreq);
-        return;
-      }
-
-      // Use large vector operations
-      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-        N -= 16;
-        lowerSet(IceType_v16i8, N);
-      }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
-    }
-
-    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
-      Base = legalizeToReg(Dest);
-      // 3 is the awkward size as it is too small for the vector or 32-bit
-      // operations and will not work with lowerLeftOvers as there is no valid
-      // overlap.
-      if (CountValue == 3) {
-        lowerSet(IceType_i16, 0);
-        lowerSet(IceType_i8, 2);
-        return;
-      }
-
-      // TODO(ascull); 64-bit can do better with 64-bit mov
-      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-        N -= 4;
-        lowerSet(IceType_i32, N);
-      }
-      lowerLeftOvers(CountValue & 0x3);
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
+      }
+
+      if (RemainingBytes == 0)
+        return;
+
+      // Lower the remaining bytes. Adjust to larger types in order to make
+      // use of overlaps in the copies.
+      Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
       return;
     }
   }
 
   // Fall back on calling the memset function. The value operand needs to be
   // extended to a stack slot size because the PNaCl ABI requires arguments to
   // be at least 32 bits wide.
   Operand *ValExt;
   if (IsValConst) {
     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
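The SpreadValue expression splats the constant byte across a 32-bit immediate so each scalar store writes as many copies of the byte as its width allows; the narrower i16/i8 stores simply truncate the same pattern. A quick standalone sanity check (not part of the patch):

#include <cstdint>
#include <cstdio>

// Replicate the constant byte into all four bytes of a 32-bit immediate,
// as lowerMemset's SpreadValue does.
static uint32_t spread(uint32_t ValValue) {
  return (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
}

int main() {
  printf("0x%08X\n", spread(0xAB));          // 0xABABABAB (an i32 store)
  printf("0x%04X\n", spread(0xAB) & 0xFFFF); // 0xABAB (what an i16 store keeps)
}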
(...skipping 1200 matching lines...)
   // There aren't any 64-bit integer registers for x86-32.
   assert(Type != IceType_i64);
   Variable *Reg = Func->makeVariable(Type);
   if (RegNum == Variable::NoRegister)
     Reg->setWeightInfinite();
   else
     Reg->setRegNum(RegNum);
   return Reg;
 }
 
+template <class Machine>
+const Type TargetX86Base<Machine>::TypeForSize[] = {
+    IceType_i8, IceType_i16, IceType_i32,
+    (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
+
+template <class Machine>
+Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
+                                               uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  uint32_t MaxIndex = MaxSize != NoSizeLimit
+                          ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined)
+                          : llvm::array_lengthof(TypeForSize) - 1;
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}

Jim Stichnoth 2015/08/20 16:57:04: I think it's generally easier to read if you minim…
ascull 2015/08/20 18:41:56: Done.

+
+template <class Machine>
+Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size,
+                                                   uint32_t MaxSize) {
+  assert(Size != 0);
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (!llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex = MaxSize != NoSizeLimit
+                          ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined)
+                          : llvm::array_lengthof(TypeForSize) - 1;
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
+
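Both helpers are log2-indexed lookups into TypeForSize: largestTypeInSize takes floor(log2(Size)) to pick the widest chunk no larger than Size, while firstTypeThatFitsSize takes ceil(log2(Size)) to pick the narrowest chunk that covers Size in one access, each clamped by MaxSize (NoSizeLimit meaning the whole table). A standalone sketch of the index math, using __builtin_clz and byte widths in place of llvm::findLastSet and the Ice types:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Chunk widths standing in for TypeForSize: i8, i16, i32, i64/f64, v16i8.
static const uint32_t WidthForIndex[] = {1, 2, 4, 8, 16};
static const uint32_t NumWidths = 5;

// Index of the most significant set bit, i.e. floor(log2(X)), matching
// llvm::findLastSet with ZB_Undefined for nonzero inputs.
static uint32_t findLastSet(uint32_t X) {
  assert(X != 0);
  return 31 - __builtin_clz(X);
}

static uint32_t largestWidthInSize(uint32_t Size) {
  // floor(log2): the widest chunk that is not larger than Size.
  return WidthForIndex[std::min(findLastSet(Size), NumWidths - 1)];
}

static uint32_t firstWidthThatFitsSize(uint32_t Size) {
  // ceil(log2): the narrowest chunk that covers Size in a single access.
  uint32_t Idx = findLastSet(Size);
  if ((Size & (Size - 1)) != 0) // not a power of two, so round up
    ++Idx;
  return WidthForIndex[std::min(Idx, NumWidths - 1)];
}

int main() {
  printf("%u %u\n", largestWidthInSize(11), firstWidthThatFitsSize(11)); // 8 16
  printf("%u %u\n", largestWidthInSize(3), firstWidthThatFitsSize(3));   // 2 4
}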
 template <class Machine> void TargetX86Base<Machine>::postLower() {
   if (Ctx->getFlags().getOptLevel() == Opt_m1)
     return;
   inferTwoAddress();
 }
 
 template <class Machine>
 void TargetX86Base<Machine>::makeRandomRegisterPermutation(
     llvm::SmallVectorImpl<int32_t> &Permutation,
     const llvm::SmallBitVector &ExcludeRegisters) const {
(...skipping 233 matching lines...)
   }
   // the offset is not eligible for blinding or pooling, return the original
   // mem operand
   return MemOperand;
 }
 
 } // end of namespace X86Internal
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H