Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3016 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3027 Call->addArg(Instr->getArg(0)); | 3027 Call->addArg(Instr->getArg(0)); |
| 3028 Call->addArg(Instr->getArg(1)); | 3028 Call->addArg(Instr->getArg(1)); |
| 3029 lowerCall(Call); | 3029 lowerCall(Call); |
| 3030 return; | 3030 return; |
| 3031 } | 3031 } |
| 3032 case Intrinsics::Memcpy: { | 3032 case Intrinsics::Memcpy: { |
| 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3034 return; | 3034 return; |
| 3035 } | 3035 } |
| 3036 case Intrinsics::Memmove: { | 3036 case Intrinsics::Memmove: { |
| 3037 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3037 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3038 Call->addArg(Instr->getArg(0)); | |
| 3039 Call->addArg(Instr->getArg(1)); | |
| 3040 Call->addArg(Instr->getArg(2)); | |
| 3041 lowerCall(Call); | |
| 3042 return; | 3038 return; |
| 3043 } | 3039 } |
| 3044 case Intrinsics::Memset: { | 3040 case Intrinsics::Memset: { |
| 3045 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3041 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3046 return; | 3042 return; |
| 3047 } | 3043 } |
| 3048 case Intrinsics::NaClReadTP: { | 3044 case Intrinsics::NaClReadTP: { |
| 3049 if (Ctx->getFlags().getUseSandboxing()) { | 3045 if (Ctx->getFlags().getUseSandboxing()) { |
| 3050 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); | 3046 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); |
| 3051 Variable *Dest = Instr->getDest(); | 3047 Variable *Dest = Instr->getDest(); |
| (...skipping 422 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3474 _bsr(T_Dest2, SecondVar); | 3470 _bsr(T_Dest2, SecondVar); |
| 3475 _xor(T_Dest2, ThirtyOne); | 3471 _xor(T_Dest2, ThirtyOne); |
| 3476 } | 3472 } |
| 3477 _test(SecondVar, SecondVar); | 3473 _test(SecondVar, SecondVar); |
| 3478 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3474 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3479 _mov(DestLo, T_Dest2); | 3475 _mov(DestLo, T_Dest2); |
| 3480 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3476 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3481 } | 3477 } |
| 3482 | 3478 |
| 3483 template <class Machine> | 3479 template <class Machine> |
| 3480 Type TargetX86Base<Machine>::typeForSize(uint32_t Size, bool Overflow, | |
|
Jim Stichnoth
2015/08/12 13:41:47
Is there anything machine-specific about this func
ascull
2015/08/17 22:18:52
Possibly, as we discussed.
| |
| 3481 uint32_t LimitWidth) { | |
| 3482 assert(Size != 0); | |
| 3483 static const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, | |
| 3484 IceType_f64, IceType_v16i8}; | |
|
John
2015/08/12 17:44:57
See if
Traits::Is64Bit ? IceType_i64 : IceType_f6
ascull
2015/08/17 22:18:53
That does work. And it sorts on of the TODOs, than
| |
| 3485 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); | |
| 3486 if (Overflow && !llvm::isPowerOf2_32(Size)) | |
| 3487 ++TyIndex; | |
| 3488 uint32_t MaxIndex = | |
| 3489 LimitWidth ? llvm::findLastSet(LimitWidth, llvm::ZB_Undefined) : 4; | |
|
Jim Stichnoth
2015/08/12 13:41:47
Can you use llvm::array_lengthof(TypeForSize)-1 in
ascull
2015/08/17 22:18:52
This is the function I tried and failed to find, T
| |
| 3490 return TypeForSize[std::min(TyIndex, MaxIndex)]; | |
| 3491 } | |
| 3492 | |
| 3493 template <class Machine> | |
| 3494 void TargetX86Base<Machine>::lowerCopyMem(Type Ty, Variable *Dest, | |
| 3495 Variable *Src, uint32_t OffsetAmt) { | |
| 3496 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | |
| 3497 // TODO(ascull): this or add nullptr test to _movp, _movq | |
| 3498 Variable *Data = makeReg(Ty); | |
| 3499 | |
| 3500 // TODO(ascull): is 64-bit better with vector or scalar movq? | |
| 3501 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, Src, Offset); | |
| 3502 if (isVectorType(Ty)) | |
| 3503 _movp(Data, SrcMem); | |
| 3504 else if (Ty == IceType_f64) | |
| 3505 _movq(Data, SrcMem); | |
| 3506 else | |
| 3507 _mov(Data, SrcMem); | |
| 3508 | |
| 3509 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, Dest, Offset); | |
| 3510 if (isVectorType(Ty)) | |
| 3511 _storep(Data, DestMem); | |
|
John
2015/08/12 17:44:57
just curious: why store[pq]\? instead of mov?
you
ascull
2015/08/17 22:18:53
_store seems to be needed for writing to memory. I
| |
| 3512 else if (Ty == IceType_f64) | |
| 3513 _storeq(Data, DestMem); | |
| 3514 else | |
| 3515 _store(Data, DestMem); | |
| 3516 } | |
| 3517 | |
| 3518 template <class Machine> | |
| 3484 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, | 3519 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| 3485 Operand *Count) { | 3520 Operand *Count) { |
| 3486 // There is a load and store for each chunk in the unroll | 3521 // There is a load and store for each chunk in the unroll |
| 3487 constexpr uint32_t UNROLL_LIMIT = 8; | 3522 constexpr uint32_t UNROLL_LIMIT = 8; |
| 3488 constexpr uint32_t BytesPerStorep = 16; | 3523 constexpr uint32_t BytesPerStorep = 16; |
| 3489 constexpr uint32_t BytesPerStoreq = 8; | |
| 3490 constexpr uint32_t BytesPerStorei32 = 4; | |
| 3491 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3492 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3493 | 3524 |
| 3494 // Check if the operands are constants | 3525 // Check if the operands are constants |
| 3495 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3526 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3496 const bool IsCountConst = CountConst != nullptr; | 3527 const bool IsCountConst = CountConst != nullptr; |
| 3497 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3528 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3498 | 3529 |
| 3499 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3530 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3500 // Unlikely, but nothing to do if it does happen | 3531 // Unlikely, but nothing to do if it does happen |
| 3501 if (CountValue == 0) | 3532 if (CountValue == 0) |
| 3502 return; | 3533 return; |
| 3503 | 3534 |
| 3504 Variable *SrcBase = legalizeToReg(Src); | 3535 Variable *SrcBase = legalizeToReg(Src); |
| 3505 Variable *DestBase = legalizeToReg(Dest); | 3536 Variable *DestBase = legalizeToReg(Dest); |
| 3506 | 3537 |
| 3507 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { | 3538 // Find the largest type that can be used and use it as much as possible in |
| 3508 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3539 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3509 // TODO(ascull): this or add nullptr test to _movp, _movq | 3540 // the remainder will be at the end, there will be reduced pressure on the |
|
Jim Stichnoth
2015/08/12 13:41:47
reduced (I think)
ascull
2015/08/17 22:18:53
Done.
| |
| 3510 Variable *Data = makeReg(Ty); | 3541 // memory unit as the accesses to the same memory are far apart. |
|
Jim Stichnoth
2015/08/12 13:41:47
accesses ?
ascull
2015/08/17 22:18:53
Done.
| |
| 3542 Type Ty = typeForSize(CountValue); | |
| 3543 uint32_t TyWidth = typeWidthInBytes(Ty); | |
| 3511 | 3544 |
| 3512 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3545 uint32_t RemainingBytes = CountValue; |
|
John
2015/08/12 17:44:57
do you really need uint32_t's here? unless you rea
ascull
2015/08/17 22:18:53
The CountValue comes from a size_t so is unsigned.
| |
| 3513 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); | 3546 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3514 if (isVectorType(Ty)) | 3547 while (RemainingBytes >= TyWidth) { |
| 3515 _movp(Data, SrcMem); | 3548 lowerCopyMem(Ty, DestBase, SrcBase, Offset); |
| 3516 else if (Ty == IceType_f64) | 3549 RemainingBytes -= TyWidth; |
| 3517 _movq(Data, SrcMem); | 3550 Offset -= TyWidth; |
| 3518 else | |
| 3519 _mov(Data, SrcMem); | |
| 3520 | |
| 3521 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); | |
| 3522 if (isVectorType(Ty)) | |
| 3523 _storep(Data, DestMem); | |
| 3524 else if (Ty == IceType_f64) | |
| 3525 _storeq(Data, DestMem); | |
| 3526 else | |
| 3527 _store(Data, DestMem); | |
| 3528 }; | |
| 3529 | |
| 3530 // Lowers the assignment to the remaining bytes. Assumes the original size | |
| 3531 // was large enough to allow for overlaps. | |
| 3532 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { | |
| 3533 if (Size > BytesPerStoreq) { | |
| 3534 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); | |
| 3535 } else if (Size > BytesPerStorei32) { | |
| 3536 lowerCopy(IceType_f64, CountValue - BytesPerStoreq); | |
| 3537 } else if (Size > BytesPerStorei16) { | |
| 3538 lowerCopy(IceType_i32, CountValue - BytesPerStorei32); | |
| 3539 } else if (Size > BytesPerStorei8) { | |
| 3540 lowerCopy(IceType_i16, CountValue - BytesPerStorei16); | |
| 3541 } else if (Size == BytesPerStorei8) { | |
| 3542 lowerCopy(IceType_i8, CountValue - BytesPerStorei8); | |
| 3543 } | |
| 3544 }; | |
| 3545 | |
| 3546 if (CountValue >= BytesPerStorep) { | |
| 3547 // Use large vector operations | |
| 3548 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | |
| 3549 N -= BytesPerStorep; | |
| 3550 lowerCopy(IceType_v16i8, N); | |
| 3551 } | |
| 3552 lowerLeftOvers(CountValue & 0xF); | |
| 3553 return; | |
| 3554 } | 3551 } |
| 3555 | 3552 |
| 3556 // Too small to use large vector operations so use small ones instead | 3553 if (RemainingBytes == 0) |
| 3557 if (CountValue >= BytesPerStoreq) { | |
| 3558 lowerCopy(IceType_f64, 0); | |
| 3559 lowerLeftOvers(CountValue - BytesPerStoreq); | |
| 3560 return; | 3554 return; |
| 3561 } | |
| 3562 | 3555 |
| 3563 // Too small for vector operations so use scalar ones | 3556 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3564 if (CountValue >= BytesPerStorei32) { | 3557 // of overlaps in the copies. |
| 3565 lowerCopy(IceType_i32, 0); | 3558 constexpr bool Overflow = true; |
| 3566 lowerLeftOvers(CountValue - BytesPerStorei32); | 3559 Type LeftOverTy = typeForSize(RemainingBytes, Overflow); |
| 3567 return; | 3560 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
|
John
2015/08/12 17:44:57
be careful with unaligned memory accesses. it migh
ascull
2015/08/17 22:18:53
This is the way gcc and clang do it. Since the ali
| |
| 3568 } | 3561 lowerCopyMem(LeftOverTy, DestBase, SrcBase, Offset); |
| 3569 | |
| 3570 // 3 is the awkward size as it is too small for the vector or 32-bit | |
| 3571 // operations and will not work with lowerLeftOvers as there is no valid | |
| 3572 // overlap. | |
| 3573 if (CountValue == 3) { | |
| 3574 lowerCopy(IceType_i16, 0); | |
| 3575 lowerCopy(IceType_i8, 2); | |
| 3576 return; | |
| 3577 } | |
| 3578 | |
| 3579 // 1 or 2 can be done in a single scalar copy | |
| 3580 lowerLeftOvers(CountValue); | |
| 3581 return; | 3562 return; |
| 3582 } | 3563 } |
| 3583 | 3564 |
| 3584 // Fall back on a function call | 3565 // Fall back on a function call |
| 3585 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | 3566 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| 3586 Call->addArg(Dest); | 3567 Call->addArg(Dest); |
| 3587 Call->addArg(Src); | 3568 Call->addArg(Src); |
| 3588 Call->addArg(Count); | 3569 Call->addArg(Count); |
| 3570 lowerCall(Call); | |
| 3571 } | |
| 3572 | |
| 3573 template <class Machine> | |
| 3574 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, | |
| 3575 Operand *Count) { | |
| 3576 // There is a load and store for each chunk in the unroll | |
| 3577 constexpr uint32_t UNROLL_LIMIT = 8; | |
| 3578 constexpr uint32_t BytesPerStorep = 16; | |
| 3579 | |
| 3580 // Check if the operands are constants | |
| 3581 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | |
| 3582 const bool IsCountConst = CountConst != nullptr; | |
| 3583 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | |
| 3584 | |
| 3585 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | |
| 3586 // Unlikely, but nothing to do if it does happen | |
| 3587 if (CountValue == 0) | |
| 3588 return; | |
| 3589 | |
| 3590 Variable *SrcBase = legalizeToReg(Src); | |
| 3591 Variable *DestBase = legalizeToReg(Dest); | |
| 3592 | |
| 3593 // Make sure to only copy down to avoid overlap issues. This means Src must | |
| 3594 // have a greater address than Dest. | |
| 3595 _cmp(SrcBase, DestBase); | |
| 3596 typename Traits::Insts::Label *Label = | |
| 3597 Traits::Insts::Label::create(Func, this); | |
| 3598 _br(Traits::Cond::Br_b, Label); | |
| 3599 _xchg(SrcBase, DestBase); | |
| 3600 Context.insert(Label); | |
| 3601 | |
| 3602 // We can't assume overlapping copies are safe as the copy may overwrite | |
| 3603 // the source. This means we need to repeatedly select the largest size to | |
| 3604 // copy that doesn't cause overlap. | |
| 3605 uint32_t Size = CountValue; | |
| 3606 uint32_t Offset = 0; | |
| 3607 while (Size > 0) { | |
| 3608 Type Ty = typeForSize(Size); | |
| 3609 lowerCopyMem(Ty, DestBase, SrcBase, Offset); | |
| 3610 Size -= typeWidthInBytes(Ty); | |
| 3611 Offset += typeWidthInBytes(Ty); | |
| 3612 } | |
| 3613 | |
| 3614 return; | |
| 3615 } | |
| 3616 | |
| 3617 // Fall back on a function call | |
| 3618 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | |
| 3619 Call->addArg(Dest); | |
| 3620 Call->addArg(Src); | |
| 3621 Call->addArg(Count); | |
| 3589 lowerCall(Call); | 3622 lowerCall(Call); |
| 3590 } | 3623 } |
| 3591 | 3624 |
| 3592 template <class Machine> | 3625 template <class Machine> |
| 3593 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3626 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3594 Operand *Count) { | 3627 Operand *Count) { |
| 3595 constexpr uint32_t UNROLL_LIMIT = 16; | 3628 constexpr uint32_t UNROLL_LIMIT = 16; |
| 3596 constexpr uint32_t BytesPerStorep = 16; | 3629 constexpr uint32_t BytesPerStorep = 16; |
| 3597 constexpr uint32_t BytesPerStoreq = 8; | 3630 constexpr uint32_t BytesPerStoreq = 8; |
| 3598 constexpr uint32_t BytesPerStorei32 = 4; | 3631 constexpr uint32_t BytesPerStorei32 = 4; |
| 3599 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3600 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3601 assert(Val->getType() == IceType_i8); | 3632 assert(Val->getType() == IceType_i8); |
| 3602 | 3633 |
| 3603 // Check if the operands are constants | 3634 // Check if the operands are constants |
| 3604 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3635 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3605 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3636 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3606 const bool IsCountConst = CountConst != nullptr; | 3637 const bool IsCountConst = CountConst != nullptr; |
| 3607 const bool IsValConst = ValConst != nullptr; | 3638 const bool IsValConst = ValConst != nullptr; |
| 3608 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3639 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3609 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3640 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3610 | 3641 |
| 3611 // Unlikely, but nothing to do if it does happen | 3642 // Unlikely, but nothing to do if it does happen |
| 3612 if (IsCountConst && CountValue == 0) | 3643 if (IsCountConst && CountValue == 0) |
| 3613 return; | 3644 return; |
| 3614 | 3645 |
| 3615 // TODO(ascull): if the count is constant but val is not it would be possible | 3646 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3616 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3647 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3617 // eax, ax and al. | 3648 // eax, ax and al. |
| 3618 if (IsCountConst && IsValConst) { | 3649 if (IsCountConst && IsValConst) { |
| 3619 Variable *Base = nullptr; | 3650 Variable *Base = nullptr; |
| 3651 Variable *VecReg = nullptr; | |
| 3620 const uint32_t SpreadValue = | 3652 const uint32_t SpreadValue = |
| 3621 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3653 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3622 Variable *VecReg = nullptr; | |
| 3623 | 3654 |
| 3624 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, | 3655 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3625 uint32_t OffsetAmt) { | 3656 uint32_t OffsetAmt) { |
| 3626 assert(Base != nullptr); | 3657 assert(Base != nullptr); |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3658 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3628 | 3659 |
| 3629 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3660 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3630 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3661 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3631 if (isVectorType(Ty)) { | 3662 if (isVectorType(Ty)) { |
| 3632 assert(VecReg != nullptr); | 3663 assert(VecReg != nullptr); |
| 3633 _storep(VecReg, Mem); | 3664 _storep(VecReg, Mem); |
| 3634 } else if (Ty == IceType_i64) { | 3665 } else if (Ty == IceType_f64) { |
| 3635 assert(VecReg != nullptr); | 3666 assert(VecReg != nullptr); |
| 3636 _storeq(VecReg, Mem); | 3667 _storeq(VecReg, Mem); |
| 3637 } else { | 3668 } else { |
| 3638 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3669 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3639 } | 3670 } |
| 3640 }; | 3671 }; |
| 3641 | 3672 |
| 3642 // Lowers the assignment to the remaining bytes. Assumes the original size | 3673 // Find the largest type that can be used and use it as much as possible in |
| 3643 // was large enough to allow for overlaps. | 3674 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3644 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { | 3675 // the remainder will be at the end, there will be reduced pressure on the |
| 3645 if (Size > BytesPerStoreq) { | 3676 // memory unit as the accesses to the same memory are far apart. |
| 3646 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); | 3677 Type Ty; |
| 3647 } else if (Size > BytesPerStorei32) { | |
| 3648 lowerSet(IceType_i64, CountValue - BytesPerStoreq); | |
| 3649 } else if (Size > BytesPerStorei16) { | |
| 3650 lowerSet(IceType_i32, CountValue - BytesPerStorei32); | |
| 3651 } else if (Size > BytesPerStorei8) { | |
| 3652 lowerSet(IceType_i16, CountValue - BytesPerStorei16); | |
| 3653 } else if (Size == BytesPerStorei8) { | |
| 3654 lowerSet(IceType_i8, CountValue - BytesPerStorei8); | |
| 3655 } | |
| 3656 }; | |
| 3657 | |
| 3658 // When the value is zero it can be loaded into a vector register cheaply | |
| 3659 // using the xor trick. | |
| 3660 if (ValValue == 0 && CountValue >= BytesPerStoreq && | 3678 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3661 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3679 CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3680 // When the value is zero it can be loaded into a vector register cheaply | |
| 3681 // using the xor trick. | |
| 3662 Base = legalizeToReg(Dest); | 3682 Base = legalizeToReg(Dest); |
| 3663 VecReg = makeVectorOfZeros(IceType_v16i8); | 3683 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3684 Ty = typeForSize(CountValue); | |
| 3685 } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | |
| 3686 // When the value is non-zero or the count is small we can't use vector | |
| 3687 // instructions so are limited to 32-bit stores. |
| 3688 Base = legalizeToReg(Dest); | |
| 3689 constexpr bool Offset = false; | |
| 3690 constexpr uint32_t LimitWidth = 4; | |
| 3691 Ty = typeForSize(CountValue, Offset, LimitWidth); | |
| 3692 } | |
| 3664 | 3693 |
| 3665 // Too small to use large vector operations so use small ones instead | 3694 if (Base) { |
| 3666 if (CountValue < BytesPerStorep) { | 3695 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3667 lowerSet(IceType_i64, 0); | 3696 |
| 3668 lowerLeftOvers(CountValue - BytesPerStoreq); | 3697 uint32_t RemainingBytes = CountValue; |
| 3669 return; | 3698 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3699 while (RemainingBytes >= TyWidth) { | |
| 3700 lowerSet(Ty, Offset); | |
| 3701 RemainingBytes -= TyWidth; | |
| 3702 Offset -= TyWidth; | |
| 3670 } | 3703 } |
| 3671 | 3704 |
| 3672 // Use large vector operations | 3705 if (RemainingBytes == 0) |
| 3673 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3706 return; |
| 3674 N -= 16; | |
| 3675 lowerSet(IceType_v16i8, N); | |
| 3676 } | |
| 3677 lowerLeftOvers(CountValue & 0xF); | |
| 3678 return; | |
| 3679 } | |
| 3680 | 3707 |
| 3681 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3708 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3682 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | 3709 // of overlaps in the copies. |
| 3683 Base = legalizeToReg(Dest); | 3710 constexpr bool Overflow = true; |
| 3684 // 3 is the awkward size as it is too small for the vector or 32-bit | 3711 Type LeftOverTy = typeForSize(RemainingBytes, Overflow); |
| 3685 // operations and will not work with lowerLeftOvers as there is no valid | 3712 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3686 // overlap. | 3713 lowerSet(LeftOverTy, Offset); |
| 3687 if (CountValue == 3) { | |
| 3688 lowerSet(IceType_i16, 0); | |
| 3689 lowerSet(IceType_i8, 2); | |
| 3690 return; | |
| 3691 } | |
| 3692 | |
| 3693 // TODO(ascull); 64-bit can do better with 64-bit mov | |
| 3694 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3695 N -= 4; | |
| 3696 lowerSet(IceType_i32, N); | |
| 3697 } | |
| 3698 lowerLeftOvers(CountValue & 0x3); | |
| 3699 return; | 3714 return; |
| 3700 } | 3715 } |
| 3701 } | 3716 } |
| 3702 | 3717 |
| 3703 // Fall back on calling the memset function. The value operand needs to be | 3718 // Fall back on calling the memset function. The value operand needs to be |
| 3704 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3719 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3705 // be at least 32 bits wide. | 3720 // be at least 32 bits wide. |
| 3706 Operand *ValExt; | 3721 Operand *ValExt; |
| 3707 if (IsValConst) { | 3722 if (IsValConst) { |
| 3708 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3723 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5162 } | 5177 } |
| 5163 // the offset is not eligible for blinding or pooling, return the original | 5178 // the offset is not eligible for blinding or pooling, return the original |
| 5164 // mem operand | 5179 // mem operand |
| 5165 return MemOperand; | 5180 return MemOperand; |
| 5166 } | 5181 } |
| 5167 | 5182 |
| 5168 } // end of namespace X86Internal | 5183 } // end of namespace X86Internal |
| 5169 } // end of namespace Ice | 5184 } // end of namespace Ice |
| 5170 | 5185 |
| 5171 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5186 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
| OLD | NEW |