Chromium Code Reviews

Side by Side Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1279833005: Inline memcpy for small constant sizes. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Created 5 years, 4 months ago
 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
 //
 // The Subzero Code Generator
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
(...skipping 3012 matching lines...)
     return;
   }
   case Intrinsics::Longjmp: {
     InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
     Call->addArg(Instr->getArg(0));
     Call->addArg(Instr->getArg(1));
     lowerCall(Call);
     return;
   }
   case Intrinsics::Memcpy: {
-    // In the future, we could potentially emit an inline memcpy/memset, etc.
-    // for intrinsic calls w/ a known length.
-    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memmove: {
     InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
     Call->addArg(Instr->getArg(0));
     Call->addArg(Instr->getArg(1));
     Call->addArg(Instr->getArg(2));
     lowerCall(Call);
     return;
   }
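
Note: the Memcpy change above replaces the unconditional helper call with
lowerMemcpy (added further down in this patch), which expands small
constant-length copies inline and otherwise still emits the H_call_memcpy
call. The inline threshold is BytesPerStorep * UNROLL_LIMIT = 16 * 8 = 128
bytes. A minimal sketch of just that dispatch decision, in plain C++ with the
Subzero types elided (the helper name is illustrative, not part of the patch):

    #include <cstdint>

    // Inline only when the length is a compile-time constant and the unrolled
    // sequence stays bounded: at most 8 chunks of 16 bytes each.
    constexpr uint32_t BytesPerStorep = 16;
    constexpr uint32_t UnrollLimit = 8;

    bool shouldInlineMemcpy(bool IsCountConst, uint32_t CountValue) {
      return IsCountConst && CountValue <= BytesPerStorep * UnrollLimit;
    }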
(...skipping 430 matching lines...)
     _bsr(T_Dest2, SecondVar);
     _xor(T_Dest2, ThirtyOne);
   }
   _test(SecondVar, SecondVar);
   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
   _mov(DestLo, T_Dest2);
   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
 }

 template <class Machine>
+void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
+                                         Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t UNROLL_LIMIT = 8;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+      // TODO(ascull): this or add nullptr test to _movp, _movq
+      Variable *Data = makeReg(Ty);
+
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
+      if (isVectorType(Ty))
+        _movp(Data, SrcMem);
+      else if (Ty == IceType_f64)
+        _movq(Data, SrcMem);
+      else
+        _mov(Data, SrcMem);
+
+      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
+      if (isVectorType(Ty))
+        _storep(Data, DestMem);
+      else if (Ty == IceType_f64)
+        _storeq(Data, DestMem);
+      else
+        _store(Data, DestMem);
+    };
+
+    // Lowers the assignment to the remaining bytes. Assumes the original size
+    // was large enough to allow for overlaps.
+    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
+      if (Size > BytesPerStoreq) {
+        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };
+
+    if (CountValue >= BytesPerStorep) {
+      // Use large vector operations
+      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
+        N -= BytesPerStorep;
+        lowerCopy(IceType_v16i8, N);
+      }
+      lowerLeftOvers(CountValue & 0xF);
+      return;
+    }
+
+    // Too small to use large vector operations so use small ones instead
+    if (CountValue >= BytesPerStoreq) {
+      lowerCopy(IceType_f64, 0);
+      lowerLeftOvers(CountValue - BytesPerStoreq);
+      return;
+    }
+
+    // Too small for vector operations so use scalar ones
+    if (CountValue >= BytesPerStorei32) {
+      lowerCopy(IceType_i32, 0);
+      lowerLeftOvers(CountValue - BytesPerStorei32);
+      return;
+    }
+
+    // 3 is the awkward size as it is too small for the vector or 32-bit
+    // operations and will not work with lowerLeftOvers as there is no valid
+    // overlap.
+    if (CountValue == 3) {
+      lowerCopy(IceType_i16, 0);
+      lowerCopy(IceType_i8, 2);
+      return;
+    }
+
+    // 1 or 2 can be done in a single scalar copy
+    lowerLeftOvers(CountValue);
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
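
Note: lowerLeftOvers relies on overlapping stores: the final chunk is placed at
CountValue minus the chunk width so that it ends exactly at the last byte, and
any bytes it re-covers were already written with the same source data. A hand
trace for a hypothetical 13-byte copy (illustrative; CountValue = 13):

    lowerCopy(IceType_f64, 0);  // copies bytes [0, 8)
    lowerLeftOvers(5);          // 5 > BytesPerStorei32, so:
    lowerCopy(IceType_f64, 5);  // copies bytes [5, 13), re-covering [5, 8)

The same idea in standalone C++, valid under memcpy's precondition that Dest
and Src do not overlap (the function name is an assumption for illustration):

    #include <cstdint>
    #include <cstring>

    void copy13(uint8_t *Dest, const uint8_t *Src) {
      uint64_t Chunk;
      std::memcpy(&Chunk, Src, 8);       // load bytes [0, 8)
      std::memcpy(Dest, &Chunk, 8);      // store bytes [0, 8)
      std::memcpy(&Chunk, Src + 5, 8);   // load bytes [5, 13)
      std::memcpy(Dest + 5, &Chunk, 8);  // store rewrites [5, 8) with same data
    }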
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                          Operand *Count) {
   constexpr uint32_t UNROLL_LIMIT = 16;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);

   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
   const bool IsCountConst = CountConst != nullptr;
   const bool IsValConst = ValConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;

   // Unlikely, but nothing to do if it does happen
   if (IsCountConst && CountValue == 0)
     return;

   // TODO(ascull): if the count is constant but val is not it would be possible
   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
   // eax, ax and al.
   if (IsCountConst && IsValConst) {
-    Variable *Base = legalizeToReg(Dest);
-    // Add a FakeUse in case Base is ultimately not used, e.g. it falls back to
-    // calling memset(). Otherwise Om1 register allocation fails because this
-    // infinite-weight variable has a definition but no uses.
-    Context.insert(InstFakeUse::create(Func, Base));
+    Variable *Base = nullptr;
+    const uint32_t SpreadValue =
+        (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
+    Variable *VecReg = nullptr;

-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      Constant *Offset = nullptr;
-      auto *Mem =
-          Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
-      _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
+    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
+                                                        uint32_t OffsetAmt) {
+      assert(Base != nullptr);
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;

-      Offset = Ctx->getConstantInt8(2);
-      Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset);
-      _store(Ctx->getConstantInt8(ValValue), Mem);
-      return;
-    }
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+      if (isVectorType(Ty)) {
+        assert(VecReg != nullptr);
+        _storep(VecReg, Mem);
+      } else if (Ty == IceType_i64) {
+        assert(VecReg != nullptr);
+        _storeq(VecReg, Mem);
+      } else {
+        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+      }
+    };

     // Lowers the assignment to the remaining bytes. Assumes the original size
     // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, Base, CountValue](
-        uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
-      auto lowerStoreSpreadValue =
-          [this, Base, CountValue, SpreadValue](Type Ty) {
-            Constant *Offset =
-                Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
-            auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
-            _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
-          };
-
-      if (Size > 8) {
-        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
-        auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
-                                                  Offset);
-        _storep(VecReg, Mem);
-      } else if (Size > 4) {
-        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
-        _storeq(VecReg, Mem);
-      } else if (Size > 2) {
-        lowerStoreSpreadValue(IceType_i32);
-      } else if (Size > 1) {
-        lowerStoreSpreadValue(IceType_i16);
-      } else if (Size == 1) {
-        lowerStoreSpreadValue(IceType_i8);
-      }
-    };
+    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
+      if (Size > BytesPerStoreq) {
+        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };

-    // When the value is zero it can be loaded into a register cheaply using
-    // the xor trick.
-    constexpr uint32_t BytesPerStorep = 16;
-    if (ValValue == 0 && CountValue >= 8 &&
+    // When the value is zero it can be loaded into a vector register cheaply
+    // using the xor trick.
+    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
         CountValue <= BytesPerStorep * UNROLL_LIMIT) {
-      Variable *Zero = makeVectorOfZeros(IceType_v16i8);
+      Base = legalizeToReg(Dest);
+      VecReg = makeVectorOfZeros(IceType_v16i8);

       // Too small to use large vector operations so use small ones instead
-      if (CountValue < 16) {
-        Constant *Offset = nullptr;
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
-        _storeq(Zero, Mem);
-        lowerLeftOvers(0, CountValue - 8, Zero);
+      if (CountValue < BytesPerStorep) {
+        lowerSet(IceType_i64, 0);
+        lowerLeftOvers(CountValue - BytesPerStoreq);
         return;
       }

-      assert(CountValue >= 16);
       // Use large vector operations
       for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
         N -= 16;
-        Constant *Offset = Ctx->getConstantInt32(N);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
-        _storep(Zero, Mem);
+        lowerSet(IceType_v16i8, N);
       }
-      uint32_t LeftOver = CountValue & 0xF;
-      lowerLeftOvers(0, LeftOver, Zero);
+      lowerLeftOvers(CountValue & 0xF);
       return;
     }
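
Note: the "xor trick" in the comment above is the usual x86 idiom of zeroing a
register by XORing it with itself instead of loading a 16-byte zero constant
from memory; makeVectorOfZeros presumably reduces to that idiom. The equivalent
at the SSE2 intrinsics level (an illustrative sketch, not Subzero code):

    #include <emmintrin.h>
    #include <cstdint>

    void zero16(uint8_t *Dest) {
      __m128i Zero = _mm_setzero_si128();  // typically emitted as pxor xmm, xmm
      _mm_storeu_si128(reinterpret_cast<__m128i *>(Dest), Zero);  // 16-byte store
    }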

     // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    constexpr uint32_t BytesPerStore = 4;
-    if (CountValue <= BytesPerStore * UNROLL_LIMIT) {
+    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
+      Base = legalizeToReg(Dest);
+      // 3 is the awkward size as it is too small for the vector or 32-bit
+      // operations and will not work with lowerLeftOvers as there is no valid
+      // overlap.
+      if (CountValue == 3) {
+        lowerSet(IceType_i16, 0);
+        lowerSet(IceType_i8, 2);
+        return;
+      }
+
       // TODO(ascull); 64-bit can do better with 64-bit mov
-      uint32_t SpreadValue =
-          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-      if (CountValue >= 4) {
-        Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
-        for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-          N -= 4;
-          Constant *Offset = Ctx->getConstantInt32(N);
-          auto *Mem =
-              Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
-          _store(ValueConst, Mem);
-        }
-      }
-      uint32_t LeftOver = CountValue & 0x3;
-      lowerLeftOvers(SpreadValue, LeftOver, nullptr);
+      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
+        N -= 4;
+        lowerSet(IceType_i32, N);
+      }
+      lowerLeftOvers(CountValue & 0x3);
       return;
     }
   }
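
Note: SpreadValue replicates the constant byte into all four lanes of a 32-bit
word (equivalently, ValValue * 0x01010101), so each 32-bit store sets four
bytes at once; e.g. ValValue = 0xAB yields 0xABABABAB. A minimal self-check
(not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Replicate a byte across a 32-bit word, as lowerMemset does above.
    uint32_t spreadByte(uint32_t ValValue) {
      return (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
    }

    int main() {
      assert(spreadByte(0xAB) == 0xABABABABu);
      assert(spreadByte(0xAB) == 0xAB * 0x01010101u);
      return 0;
    }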

   // Fall back on calling the memset function. The value operand needs to be
   // extended to a stack slot size because the PNaCl ABI requires arguments to
   // be at least 32 bits wide.
   Operand *ValExt;
   if (IsValConst) {
     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
(...skipping 1453 matching lines...)
   }
   // the offset is not eligible for blinding or pooling, return the original
   // mem operand
   return MemOperand;
 }

 } // end of namespace X86Internal
 } // end of namespace Ice

 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
