Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1278173009: Inline memove for small constant sizes and refactor memcpy and memset. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
2 // 2 //
3 // The Subzero Code Generator 3 // The Subzero Code Generator
4 // 4 //
5 // This file is distributed under the University of Illinois Open Source 5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details. 6 // License. See LICENSE.TXT for details.
7 // 7 //
8 //===----------------------------------------------------------------------===// 8 //===----------------------------------------------------------------------===//
9 /// 9 ///
10 /// \file 10 /// \file
(...skipping 3016 matching lines...) Expand 10 before | Expand all | Expand 10 after
3027 Call->addArg(Instr->getArg(0)); 3027 Call->addArg(Instr->getArg(0));
3028 Call->addArg(Instr->getArg(1)); 3028 Call->addArg(Instr->getArg(1));
3029 lowerCall(Call); 3029 lowerCall(Call);
3030 return; 3030 return;
3031 } 3031 }
3032 case Intrinsics::Memcpy: { 3032 case Intrinsics::Memcpy: {
3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3034 return; 3034 return;
3035 } 3035 }
3036 case Intrinsics::Memmove: { 3036 case Intrinsics::Memmove: {
3037 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); 3037 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3038 Call->addArg(Instr->getArg(0));
3039 Call->addArg(Instr->getArg(1));
3040 Call->addArg(Instr->getArg(2));
3041 lowerCall(Call);
3042 return; 3038 return;
3043 } 3039 }
3044 case Intrinsics::Memset: { 3040 case Intrinsics::Memset: {
3045 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); 3041 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3046 return; 3042 return;
3047 } 3043 }
3048 case Intrinsics::NaClReadTP: { 3044 case Intrinsics::NaClReadTP: {
3049 if (Ctx->getFlags().getUseSandboxing()) { 3045 if (Ctx->getFlags().getUseSandboxing()) {
3050 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); 3046 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand);
3051 Variable *Dest = Instr->getDest(); 3047 Variable *Dest = Instr->getDest();
(...skipping 422 matching lines...) Expand 10 before | Expand all | Expand 10 after
3474 _bsr(T_Dest2, SecondVar); 3470 _bsr(T_Dest2, SecondVar);
3475 _xor(T_Dest2, ThirtyOne); 3471 _xor(T_Dest2, ThirtyOne);
3476 } 3472 }
3477 _test(SecondVar, SecondVar); 3473 _test(SecondVar, SecondVar);
3478 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); 3474 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
3479 _mov(DestLo, T_Dest2); 3475 _mov(DestLo, T_Dest2);
3480 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); 3476 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
3481 } 3477 }
3482 3478
3483 template <class Machine> 3479 template <class Machine>
3480 Type TargetX86Base<Machine>::typeForSize(uint32_t Size, bool Overflow,
Jim Stichnoth 2015/08/12 13:41:47 Is there anything machine-specific about this func
ascull 2015/08/17 22:18:52 Possibly, as we discussed.
3481 uint32_t LimitWidth) {
3482 assert(Size != 0);
3483 static const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32,
3484 IceType_f64, IceType_v16i8};
John 2015/08/12 17:44:57 See if Traits::Is64Bit ? IceType_i64 : IceType_f6
ascull 2015/08/17 22:18:53 That does work. And it sorts one of the TODOs, than
3485 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
3486 if (Overflow && !llvm::isPowerOf2_32(Size))
3487 ++TyIndex;
3488 uint32_t MaxIndex =
3489 LimitWidth ? llvm::findLastSet(LimitWidth, llvm::ZB_Undefined) : 4;
Jim Stichnoth 2015/08/12 13:41:47 Can you use llvm::array_lengthof(TypeForSize)-1 in
ascull 2015/08/17 22:18:52 This is the function I tried and failed to find, T
3490 return TypeForSize[std::min(TyIndex, MaxIndex)];
3491 }
3492
3493 template <class Machine>
3494 void TargetX86Base<Machine>::lowerCopyMem(Type Ty, Variable *Dest,
3495 Variable *Src, uint32_t OffsetAmt) {
3496 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
3497 // TODO(ascull): this or add nullptr test to _movp, _movq
3498 Variable *Data = makeReg(Ty);
3499
3500 // TODO(ascull): is 64-bit better with vector or scalar movq?
3501 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, Src, Offset);
3502 if (isVectorType(Ty))
3503 _movp(Data, SrcMem);
3504 else if (Ty == IceType_f64)
3505 _movq(Data, SrcMem);
3506 else
3507 _mov(Data, SrcMem);
3508
3509 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, Dest, Offset);
3510 if (isVectorType(Ty))
3511 _storep(Data, DestMem);
John 2015/08/12 17:44:57 just curious: why store[pq]\? instead of mov? you
ascull 2015/08/17 22:18:53 _store seems to be needed for writing to memory. I
3512 else if (Ty == IceType_f64)
3513 _storeq(Data, DestMem);
3514 else
3515 _store(Data, DestMem);
3516 }
3517
3518 template <class Machine>
3484 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, 3519 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
3485 Operand *Count) { 3520 Operand *Count) {
3486 // There is a load and store for each chunk in the unroll 3521 // There is a load and store for each chunk in the unroll
3487 constexpr uint32_t UNROLL_LIMIT = 8; 3522 constexpr uint32_t UNROLL_LIMIT = 8;
3488 constexpr uint32_t BytesPerStorep = 16; 3523 constexpr uint32_t BytesPerStorep = 16;
3489 constexpr uint32_t BytesPerStoreq = 8;
3490 constexpr uint32_t BytesPerStorei32 = 4;
3491 constexpr uint32_t BytesPerStorei16 = 2;
3492 constexpr uint32_t BytesPerStorei8 = 1;
3493 3524
3494 // Check if the operands are constants 3525 // Check if the operands are constants
3495 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 3526 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3496 const bool IsCountConst = CountConst != nullptr; 3527 const bool IsCountConst = CountConst != nullptr;
3497 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 3528 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3498 3529
3499 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { 3530 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
3500 // Unlikely, but nothing to do if it does happen 3531 // Unlikely, but nothing to do if it does happen
3501 if (CountValue == 0) 3532 if (CountValue == 0)
3502 return; 3533 return;
3503 3534
3504 Variable *SrcBase = legalizeToReg(Src); 3535 Variable *SrcBase = legalizeToReg(Src);
3505 Variable *DestBase = legalizeToReg(Dest); 3536 Variable *DestBase = legalizeToReg(Dest);
3506 3537
3507 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { 3538 // Find the largest type that can be used and use it as much as possible in
3508 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 3539 // reverse order. Then handle any remainder with overlapping copies. Since
3509 // TODO(ascull): this or add nullptr test to _movp, _movq 3540 // the remainder will be at the end, there will be reduced pressure on the
Jim Stichnoth 2015/08/12 13:41:47 reduced (I think)
ascull 2015/08/17 22:18:53 Done.
3510 Variable *Data = makeReg(Ty); 3541 // memory unit as the accesses to the same memory are far apart.
Jim Stichnoth 2015/08/12 13:41:47 accesses ?
ascull 2015/08/17 22:18:53 Done.
3542 Type Ty = typeForSize(CountValue);
3543 uint32_t TyWidth = typeWidthInBytes(Ty);
3511 3544
3512 // TODO(ascull): is 64-bit better with vector or scalar movq? 3545 uint32_t RemainingBytes = CountValue;
John 2015/08/12 17:44:57 do you really need uint32_t's here? unless you rea
ascull 2015/08/17 22:18:53 The CountValue comes from a size_t so is unsigned.
3513 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); 3546 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
3514 if (isVectorType(Ty)) 3547 while (RemainingBytes >= TyWidth) {
3515 _movp(Data, SrcMem); 3548 lowerCopyMem(Ty, DestBase, SrcBase, Offset);
3516 else if (Ty == IceType_f64) 3549 RemainingBytes -= TyWidth;
3517 _movq(Data, SrcMem); 3550 Offset -= TyWidth;
3518 else
3519 _mov(Data, SrcMem);
3520
3521 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
3522 if (isVectorType(Ty))
3523 _storep(Data, DestMem);
3524 else if (Ty == IceType_f64)
3525 _storeq(Data, DestMem);
3526 else
3527 _store(Data, DestMem);
3528 };
3529
3530 // Lowers the assignment to the remaining bytes. Assumes the original size
3531 // was large enough to allow for overlaps.
3532 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
3533 if (Size > BytesPerStoreq) {
3534 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
3535 } else if (Size > BytesPerStorei32) {
3536 lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
3537 } else if (Size > BytesPerStorei16) {
3538 lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
3539 } else if (Size > BytesPerStorei8) {
3540 lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
3541 } else if (Size == BytesPerStorei8) {
3542 lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
3543 }
3544 };
3545
3546 if (CountValue >= BytesPerStorep) {
3547 // Use large vector operations
3548 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
3549 N -= BytesPerStorep;
3550 lowerCopy(IceType_v16i8, N);
3551 }
3552 lowerLeftOvers(CountValue & 0xF);
3553 return;
3554 } 3551 }
3555 3552
3556 // Too small to use large vector operations so use small ones instead 3553 if (RemainingBytes == 0)
3557 if (CountValue >= BytesPerStoreq) {
3558 lowerCopy(IceType_f64, 0);
3559 lowerLeftOvers(CountValue - BytesPerStoreq);
3560 return; 3554 return;
3561 }
3562 3555
3563 // Too small for vector operations so use scalar ones 3556 // Lower the remaining bytes. Adjust to larger types in order to make use
3564 if (CountValue >= BytesPerStorei32) { 3557 // of overlaps in the copies.
3565 lowerCopy(IceType_i32, 0); 3558 constexpr bool Overflow = true;
3566 lowerLeftOvers(CountValue - BytesPerStorei32); 3559 Type LeftOverTy = typeForSize(RemainingBytes, Overflow);
3567 return; 3560 Offset = CountValue - typeWidthInBytes(LeftOverTy);
John 2015/08/12 17:44:57 be careful with unaligned memory accesses. it migh
ascull 2015/08/17 22:18:53 This is the way gcc and clang do it. Since the ali
3568 } 3561 lowerCopyMem(LeftOverTy, DestBase, SrcBase, Offset);
3569
3570 // 3 is the awkward size as it is too small for the vector or 32-bit
3571 // operations and will not work with lowerLeftOvers as there is no valid
3572 // overlap.
3573 if (CountValue == 3) {
3574 lowerCopy(IceType_i16, 0);
3575 lowerCopy(IceType_i8, 2);
3576 return;
3577 }
3578
3579 // 1 or 2 can be done in a single scalar copy
3580 lowerLeftOvers(CountValue);
3581 return; 3562 return;
3582 } 3563 }
3583 3564
3584 // Fall back on a function call 3565 // Fall back on a function call
3585 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); 3566 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
3586 Call->addArg(Dest); 3567 Call->addArg(Dest);
3587 Call->addArg(Src); 3568 Call->addArg(Src);
3588 Call->addArg(Count); 3569 Call->addArg(Count);
3570 lowerCall(Call);
3571 }
3572
3573 template <class Machine>
3574 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
3575 Operand *Count) {
3576 // There is a load and store for each chunk in the unroll
3577 constexpr uint32_t UNROLL_LIMIT = 8;
3578 constexpr uint32_t BytesPerStorep = 16;
3579
3580 // Check if the operands are constants
3581 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3582 const bool IsCountConst = CountConst != nullptr;
3583 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3584
3585 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
3586 // Unlikely, but nothing to do if it does happen
3587 if (CountValue == 0)
3588 return;
3589
3590 Variable *SrcBase = legalizeToReg(Src);
3591 Variable *DestBase = legalizeToReg(Dest);
3592
3593 // Make sure to only copy down to avoid overlap issues. This means Src must
3594 // have a greater address than Dest.
3595 _cmp(SrcBase, DestBase);
3596 typename Traits::Insts::Label *Label =
3597 Traits::Insts::Label::create(Func, this);
3598 _br(Traits::Cond::Br_b, Label);
3599 _xchg(SrcBase, DestBase);
3600 Context.insert(Label);
3601
3602 // We can't assume overlapping copies are safe as the copy may overwrite
3603 // the source. This means we need to repeatedly select the largest size to
3604 // copy that doesn't cause overlap.
3605 uint32_t Size = CountValue;
3606 uint32_t Offset = 0;
3607 while (Size > 0) {
3608 Type Ty = typeForSize(Size);
3609 lowerCopyMem(Ty, DestBase, SrcBase, Offset);
3610 Size -= typeWidthInBytes(Ty);
3611 Offset += typeWidthInBytes(Ty);
3612 }
3613
3614 return;
3615 }
3616
3617 // Fall back on a function call
3618 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
3619 Call->addArg(Dest);
3620 Call->addArg(Src);
3621 Call->addArg(Count);
3589 lowerCall(Call); 3622 lowerCall(Call);
3590 } 3623 }
3591 3624
3592 template <class Machine> 3625 template <class Machine>
3593 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, 3626 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
3594 Operand *Count) { 3627 Operand *Count) {
3595 constexpr uint32_t UNROLL_LIMIT = 16; 3628 constexpr uint32_t UNROLL_LIMIT = 16;
3596 constexpr uint32_t BytesPerStorep = 16; 3629 constexpr uint32_t BytesPerStorep = 16;
3597 constexpr uint32_t BytesPerStoreq = 8; 3630 constexpr uint32_t BytesPerStoreq = 8;
3598 constexpr uint32_t BytesPerStorei32 = 4; 3631 constexpr uint32_t BytesPerStorei32 = 4;
3599 constexpr uint32_t BytesPerStorei16 = 2;
3600 constexpr uint32_t BytesPerStorei8 = 1;
3601 assert(Val->getType() == IceType_i8); 3632 assert(Val->getType() == IceType_i8);
3602 3633
3603 // Check if the operands are constants 3634 // Check if the operands are constants
3604 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); 3635 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
3605 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); 3636 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
3606 const bool IsCountConst = CountConst != nullptr; 3637 const bool IsCountConst = CountConst != nullptr;
3607 const bool IsValConst = ValConst != nullptr; 3638 const bool IsValConst = ValConst != nullptr;
3608 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; 3639 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
3609 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; 3640 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
3610 3641
3611 // Unlikely, but nothing to do if it does happen 3642 // Unlikely, but nothing to do if it does happen
3612 if (IsCountConst && CountValue == 0) 3643 if (IsCountConst && CountValue == 0)
3613 return; 3644 return;
3614 3645
3615 // TODO(ascull): if the count is constant but val is not it would be possible 3646 // TODO(ascull): if the count is constant but val is not it would be possible
3616 // to inline by spreading the value across 4 bytes and accessing subregs e.g. 3647 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
3617 // eax, ax and al. 3648 // eax, ax and al.
3618 if (IsCountConst && IsValConst) { 3649 if (IsCountConst && IsValConst) {
3619 Variable *Base = nullptr; 3650 Variable *Base = nullptr;
3651 Variable *VecReg = nullptr;
3620 const uint32_t SpreadValue = 3652 const uint32_t SpreadValue =
3621 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; 3653 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
3622 Variable *VecReg = nullptr;
3623 3654
3624 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, 3655 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
3625 uint32_t OffsetAmt) { 3656 uint32_t OffsetAmt) {
3626 assert(Base != nullptr); 3657 assert(Base != nullptr);
3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; 3658 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
3628 3659
3629 // TODO(ascull): is 64-bit better with vector or scalar movq? 3660 // TODO(ascull): is 64-bit better with vector or scalar movq?
3630 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); 3661 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
3631 if (isVectorType(Ty)) { 3662 if (isVectorType(Ty)) {
3632 assert(VecReg != nullptr); 3663 assert(VecReg != nullptr);
3633 _storep(VecReg, Mem); 3664 _storep(VecReg, Mem);
3634 } else if (Ty == IceType_i64) { 3665 } else if (Ty == IceType_f64) {
3635 assert(VecReg != nullptr); 3666 assert(VecReg != nullptr);
3636 _storeq(VecReg, Mem); 3667 _storeq(VecReg, Mem);
3637 } else { 3668 } else {
3638 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); 3669 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
3639 } 3670 }
3640 }; 3671 };
3641 3672
3642 // Lowers the assignment to the remaining bytes. Assumes the original size 3673 // Find the largest type that can be used and use it as much as possible in
3643 // was large enough to allow for overlaps. 3674 // reverse order. Then handle any remainder with overlapping copies. Since
3644 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { 3675 // the remainder will be at the end, there will be reduced pressure on the
3645 if (Size > BytesPerStoreq) { 3676 // memory unit as the accesses to the same memory are far apart.
3646 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); 3677 Type Ty;
3647 } else if (Size > BytesPerStorei32) {
3648 lowerSet(IceType_i64, CountValue - BytesPerStoreq);
3649 } else if (Size > BytesPerStorei16) {
3650 lowerSet(IceType_i32, CountValue - BytesPerStorei32);
3651 } else if (Size > BytesPerStorei8) {
3652 lowerSet(IceType_i16, CountValue - BytesPerStorei16);
3653 } else if (Size == BytesPerStorei8) {
3654 lowerSet(IceType_i8, CountValue - BytesPerStorei8);
3655 }
3656 };
3657
3658 // When the value is zero it can be loaded into a vector register cheaply
3659 // using the xor trick.
3660 if (ValValue == 0 && CountValue >= BytesPerStoreq && 3678 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
3661 CountValue <= BytesPerStorep * UNROLL_LIMIT) { 3679 CountValue <= BytesPerStorep * UNROLL_LIMIT) {
3680 // When the value is zero it can be loaded into a vector register cheaply
3681 // using the xor trick.
3662 Base = legalizeToReg(Dest); 3682 Base = legalizeToReg(Dest);
3663 VecReg = makeVectorOfZeros(IceType_v16i8); 3683 VecReg = makeVectorOfZeros(IceType_v16i8);
3684 Ty = typeForSize(CountValue);
3685 } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
3686 // When the value is non-zero or the count is small we can't use vector
3687 // instructions so are limited to 32-bit stores.
3688 Base = legalizeToReg(Dest);
3689 constexpr bool Offset = false;
3690 constexpr uint32_t LimitWidth = 4;
3691 Ty = typeForSize(CountValue, Offset, LimitWidth);
3692 }
3664 3693
3665 // Too small to use large vector operations so use small ones instead 3694 if (Base) {
3666 if (CountValue < BytesPerStorep) { 3695 uint32_t TyWidth = typeWidthInBytes(Ty);
3667 lowerSet(IceType_i64, 0); 3696
3668 lowerLeftOvers(CountValue - BytesPerStoreq); 3697 uint32_t RemainingBytes = CountValue;
3669 return; 3698 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
3699 while (RemainingBytes >= TyWidth) {
3700 lowerSet(Ty, Offset);
3701 RemainingBytes -= TyWidth;
3702 Offset -= TyWidth;
3670 } 3703 }
3671 3704
3672 // Use large vector operations 3705 if (RemainingBytes == 0)
3673 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { 3706 return;
3674 N -= 16;
3675 lowerSet(IceType_v16i8, N);
3676 }
3677 lowerLeftOvers(CountValue & 0xF);
3678 return;
3679 }
3680 3707
3681 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? 3708 // Lower the remaining bytes. Adjust to larger types in order to make use
3682 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { 3709 // of overlaps in the copies.
3683 Base = legalizeToReg(Dest); 3710 constexpr bool Overflow = true;
3684 // 3 is the awkward size as it is too small for the vector or 32-bit 3711 Type LeftOverTy = typeForSize(RemainingBytes, Overflow);
3685 // operations and will not work with lowerLeftOvers as there is no valid 3712 Offset = CountValue - typeWidthInBytes(LeftOverTy);
3686 // overlap. 3713 lowerSet(LeftOverTy, Offset);
3687 if (CountValue == 3) {
3688 lowerSet(IceType_i16, 0);
3689 lowerSet(IceType_i8, 2);
3690 return;
3691 }
3692
3693 // TODO(ascull); 64-bit can do better with 64-bit mov
3694 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
3695 N -= 4;
3696 lowerSet(IceType_i32, N);
3697 }
3698 lowerLeftOvers(CountValue & 0x3);
3699 return; 3714 return;
3700 } 3715 }
3701 } 3716 }
3702 3717
3703 // Fall back on calling the memset function. The value operand needs to be 3718 // Fall back on calling the memset function. The value operand needs to be
3704 // extended to a stack slot size because the PNaCl ABI requires arguments to 3719 // extended to a stack slot size because the PNaCl ABI requires arguments to
3705 // be at least 32 bits wide. 3720 // be at least 32 bits wide.
3706 Operand *ValExt; 3721 Operand *ValExt;
3707 if (IsValConst) { 3722 if (IsValConst) {
3708 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); 3723 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
(...skipping 1453 matching lines...) Expand 10 before | Expand all | Expand 10 after
5162 } 5177 }
5163 // the offset is not eligible for blinding or pooling, return the original 5178 // the offset is not eligible for blinding or pooling, return the original
5164 // mem operand 5179 // mem operand
5165 return MemOperand; 5180 return MemOperand;
5166 } 5181 }
5167 5182
5168 } // end of namespace X86Internal 5183 } // end of namespace X86Internal
5169 } // end of namespace Ice 5184 } // end of namespace Ice
5170 5185
5171 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H 5186 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698