Chromium Code Reviews

Side by Side Diff: src/IceTargetLoweringX86BaseImpl.h

Issue 1279833005: Inline memcpy for small constant sizes. (Closed) Base URL: https://chromium.googlesource.com/native_client/pnacl-subzero.git@master
Patch Set: Created 5 years, 4 months ago
 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
 //
 // The Subzero Code Generator
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
(...skipping 3012 matching lines...)
     return;
   }
   case Intrinsics::Longjmp: {
     InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
     Call->addArg(Instr->getArg(0));
     Call->addArg(Instr->getArg(1));
     lowerCall(Call);
     return;
   }
   case Intrinsics::Memcpy: {
-    // In the future, we could potentially emit an inline memcpy/memset, etc.
-    // for intrinsic calls w/ a known length.
-    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memmove: {
     InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
     Call->addArg(Instr->getArg(0));
     Call->addArg(Instr->getArg(1));
     Call->addArg(Instr->getArg(2));
     lowerCall(Call);
     return;
   }
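
Note: the Memcpy change above replaces the unconditional helper call with
lowerMemcpy (added further down in this patch), which expands small
constant-length copies inline and otherwise still emits the H_call_memcpy
call. The inline threshold is BytesPerStorep * UNROLL_LIMIT = 16 * 8 = 128
bytes. A minimal sketch of just that dispatch decision, in plain C++ with the
Subzero types elided (the helper name is illustrative, not part of the patch):

    #include <cstdint>

    // Inline only when the length is a compile-time constant and the unrolled
    // sequence stays bounded: at most 8 chunks of 16 bytes each.
    constexpr uint32_t BytesPerStorep = 16;
    constexpr uint32_t UnrollLimit = 8;

    bool shouldInlineMemcpy(bool IsCountConst, uint32_t CountValue) {
      return IsCountConst && CountValue <= BytesPerStorep * UnrollLimit;
    }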
(...skipping 430 matching lines...)
     _bsr(T_Dest2, SecondVar);
     _xor(T_Dest2, ThirtyOne);
   }
   _test(SecondVar, SecondVar);
   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
   _mov(DestLo, T_Dest2);
   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
 }

 template <class Machine>
+void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
+                                         Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t UNROLL_LIMIT = 8;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
+
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
+
+  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
+      return;
+
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+      // TODO(ascull): this or add nullptr test to _movp, _movq
+      Variable *Data = makeReg(Ty);
+
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
+      if (isVectorType(Ty))
+        _movp(Data, SrcMem);
+      else if (Ty == IceType_f64)
+        _movq(Data, SrcMem);
+      else
+        _mov(Data, SrcMem);
+
+      auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
+      if (isVectorType(Ty))
+        _storep(Data, DestMem);
+      else if (Ty == IceType_f64)
+        _storeq(Data, DestMem);
+      else
+        _store(Data, DestMem);
+    };
+
+    // Lowers the assignment to the remaining bytes. Assumes the original size
+    // was large enough to allow for overlaps.
+    auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
+      if (Size > BytesPerStoreq) {
+        lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };
+
+    if (CountValue >= BytesPerStorep) {
+      // Use large vector operations
+      for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
+        N -= BytesPerStorep;
+        lowerCopy(IceType_v16i8, N);
+      }
+      lowerLeftOvers(CountValue & 0xF);
+      return;
+    }
+
+    // Too small to use large vector operations so use small ones instead
+    if (CountValue >= BytesPerStoreq) {
+      lowerCopy(IceType_f64, 0);
+      lowerLeftOvers(CountValue - BytesPerStoreq);
+      return;
+    }
+
+    // Too small for vector operations so use scalar ones
+    if (CountValue >= BytesPerStorei32) {
+      lowerCopy(IceType_i32, 0);
+      lowerLeftOvers(CountValue - BytesPerStorei32);
+      return;
+    }
+
+    // 3 is the awkward size as it is too small for the vector or 32-bit
+    // operations and will not work with lowerLeftOvers as there is no valid
+    // overlap.
+    if (CountValue == 3) {
+      lowerCopy(IceType_i16, 0);
+      lowerCopy(IceType_i8, 2);
+      return;
+    }
+
+    // 1 or 2 can be done in a single scalar copy
+    lowerLeftOvers(CountValue);
+    return;
+  }
+
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
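
Note: lowerLeftOvers relies on overlapping stores: the final chunk is placed at
CountValue minus the chunk width so that it ends exactly at the last byte, and
any bytes it re-covers were already written with the same source data. A hand
trace for a hypothetical 13-byte copy (illustrative; CountValue = 13):

    lowerCopy(IceType_f64, 0);  // copies bytes [0, 8)
    lowerLeftOvers(5);          // 5 > BytesPerStorei32, so:
    lowerCopy(IceType_f64, 5);  // copies bytes [5, 13), re-covering [5, 8)

The same idea in standalone C++, valid under memcpy's precondition that Dest
and Src do not overlap (the function name is an assumption for illustration):

    #include <cstdint>
    #include <cstring>

    void copy13(uint8_t *Dest, const uint8_t *Src) {
      uint64_t Chunk;
      std::memcpy(&Chunk, Src, 8);       // load bytes [0, 8)
      std::memcpy(Dest, &Chunk, 8);      // store bytes [0, 8)
      std::memcpy(&Chunk, Src + 5, 8);   // load bytes [5, 13)
      std::memcpy(Dest + 5, &Chunk, 8);  // store rewrites [5, 8) with same data
    }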
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
                                          Operand *Count) {
   constexpr uint32_t UNROLL_LIMIT = 16;
+  constexpr uint32_t BytesPerStorep = 16;
+  constexpr uint32_t BytesPerStoreq = 8;
+  constexpr uint32_t BytesPerStorei32 = 4;
+  constexpr uint32_t BytesPerStorei16 = 2;
+  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);

   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
   const bool IsCountConst = CountConst != nullptr;
   const bool IsValConst = ValConst != nullptr;
   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;

   // Unlikely, but nothing to do if it does happen
   if (IsCountConst && CountValue == 0)
     return;

   // TODO(ascull): if the count is constant but val is not it would be possible
   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
   // eax, ax and al.
   if (IsCountConst && IsValConst) {
-    Variable *Base = legalizeToReg(Dest);
-    // Add a FakeUse in case Base is ultimately not used, e.g. it falls back to
-    // calling memset(). Otherwise Om1 register allocation fails because this
-    // infinite-weight variable has a definition but no uses.
-    Context.insert(InstFakeUse::create(Func, Base));
+    Variable *Base = nullptr;
+    const uint32_t SpreadValue =
+        (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
+    Variable *VecReg = nullptr;

-    // 3 is the awkward size as it is too small for the vector or 32-bit
-    // operations and will not work with lowerLeftOvers as there is no valid
-    // overlap.
-    if (CountValue == 3) {
-      Constant *Offset = nullptr;
-      auto *Mem =
-          Traits::X86OperandMem::create(Func, IceType_i16, Base, Offset);
-      _store(Ctx->getConstantInt16((ValValue << 8) | ValValue), Mem);
+    auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
+                                                        uint32_t OffsetAmt) {
+      assert(Base != nullptr);
+      Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;

-      Offset = Ctx->getConstantInt8(2);
-      Mem = Traits::X86OperandMem::create(Func, IceType_i8, Base, Offset);
-      _store(Ctx->getConstantInt8(ValValue), Mem);
-      return;
-    }
+      // TODO(ascull): is 64-bit better with vector or scalar movq?
+      auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
+      if (isVectorType(Ty)) {
+        assert(VecReg != nullptr);
+        _storep(VecReg, Mem);
+      } else if (Ty == IceType_i64) {
+        assert(VecReg != nullptr);
+        _storeq(VecReg, Mem);
+      } else {
+        _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
+      }
+    };

     // Lowers the assignment to the remaining bytes. Assumes the original size
     // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, Base, CountValue](
-        uint32_t SpreadValue, uint32_t Size, Variable *VecReg) {
-      auto lowerStoreSpreadValue =
-          [this, Base, CountValue, SpreadValue](Type Ty) {
-            Constant *Offset =
-                Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
-            auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset);
-            _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
-          };
-
-      if (Size > 8) {
-        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 16);
-        auto *Mem = Traits::X86OperandMem::create(Func, VecReg->getType(), Base,
-                                                  Offset);
-        _storep(VecReg, Mem);
-      } else if (Size > 4) {
-        assert(VecReg != nullptr);
-        Constant *Offset = Ctx->getConstantInt32(CountValue - 8);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
-        _storeq(VecReg, Mem);
-      } else if (Size > 2) {
-        lowerStoreSpreadValue(IceType_i32);
-      } else if (Size > 1) {
-        lowerStoreSpreadValue(IceType_i16);
-      } else if (Size == 1) {
-        lowerStoreSpreadValue(IceType_i8);
-      }
-    };
+    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
+      if (Size > BytesPerStoreq) {
+        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
+      } else if (Size > BytesPerStorei32) {
+        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
+      } else if (Size > BytesPerStorei16) {
+        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
+      } else if (Size > BytesPerStorei8) {
+        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
+      } else if (Size == BytesPerStorei8) {
+        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
+      }
+    };

-    // When the value is zero it can be loaded into a register cheaply using
-    // the xor trick.
-    constexpr uint32_t BytesPerStorep = 16;
-    if (ValValue == 0 && CountValue >= 8 &&
+    // When the value is zero it can be loaded into a vector register cheaply
+    // using the xor trick.
+    if (ValValue == 0 && CountValue >= BytesPerStoreq &&
         CountValue <= BytesPerStorep * UNROLL_LIMIT) {
-      Variable *Zero = makeVectorOfZeros(IceType_v16i8);
+      Base = legalizeToReg(Dest);
+      VecReg = makeVectorOfZeros(IceType_v16i8);

       // Too small to use large vector operations so use small ones instead
-      if (CountValue < 16) {
-        Constant *Offset = nullptr;
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, IceType_i64, Base, Offset);
-        _storeq(Zero, Mem);
-        lowerLeftOvers(0, CountValue - 8, Zero);
+      if (CountValue < BytesPerStorep) {
+        lowerSet(IceType_i64, 0);
+        lowerLeftOvers(CountValue - BytesPerStoreq);
         return;
       }

-      assert(CountValue >= 16);
       // Use large vector operations
       for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
         N -= 16;
-        Constant *Offset = Ctx->getConstantInt32(N);
-        auto *Mem =
-            Traits::X86OperandMem::create(Func, Zero->getType(), Base, Offset);
-        _storep(Zero, Mem);
+        lowerSet(IceType_v16i8, N);
       }
-      uint32_t LeftOver = CountValue & 0xF;
-      lowerLeftOvers(0, LeftOver, Zero);
+      lowerLeftOvers(CountValue & 0xF);
       return;
     }
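
Note: the "xor trick" in the comment above is the usual x86 idiom of zeroing a
register by XORing it with itself instead of loading a 16-byte zero constant
from memory; makeVectorOfZeros presumably reduces to that idiom. The equivalent
at the SSE2 intrinsics level (an illustrative sketch, not Subzero code):

    #include <emmintrin.h>
    #include <cstdint>

    void zero16(uint8_t *Dest) {
      __m128i Zero = _mm_setzero_si128();  // typically emitted as pxor xmm, xmm
      _mm_storeu_si128(reinterpret_cast<__m128i *>(Dest), Zero);  // 16-byte store
    }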

     // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    constexpr uint32_t BytesPerStore = 4;
-    if (CountValue <= BytesPerStore * UNROLL_LIMIT) {
+    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
+      Base = legalizeToReg(Dest);
+      // 3 is the awkward size as it is too small for the vector or 32-bit
+      // operations and will not work with lowerLeftOvers as there is no valid
+      // overlap.
+      if (CountValue == 3) {
+        lowerSet(IceType_i16, 0);
+        lowerSet(IceType_i8, 2);
+        return;
+      }
+
       // TODO(ascull); 64-bit can do better with 64-bit mov
-      uint32_t SpreadValue =
-          (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-      if (CountValue >= 4) {
-        Constant *ValueConst = Ctx->getConstantInt32(SpreadValue);
-        for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-          N -= 4;
-          Constant *Offset = Ctx->getConstantInt32(N);
-          auto *Mem =
-              Traits::X86OperandMem::create(Func, IceType_i32, Base, Offset);
-          _store(ValueConst, Mem);
-        }
-      }
-      uint32_t LeftOver = CountValue & 0x3;
-      lowerLeftOvers(SpreadValue, LeftOver, nullptr);
+      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
+        N -= 4;
+        lowerSet(IceType_i32, N);
+      }
+      lowerLeftOvers(CountValue & 0x3);
       return;
     }
   }
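
Note: SpreadValue replicates the constant byte into all four lanes of a 32-bit
word (equivalently, ValValue * 0x01010101), so each 32-bit store sets four
bytes at once; e.g. ValValue = 0xAB yields 0xABABABAB. A minimal self-check
(not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Replicate a byte across a 32-bit word, as lowerMemset does above.
    uint32_t spreadByte(uint32_t ValValue) {
      return (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
    }

    int main() {
      assert(spreadByte(0xAB) == 0xABABABABu);
      assert(spreadByte(0xAB) == 0xAB * 0x01010101u);
      return 0;
    }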

   // Fall back on calling the memset function. The value operand needs to be
   // extended to a stack slot size because the PNaCl ABI requires arguments to
   // be at least 32 bits wide.
   Operand *ValExt;
   if (IsValConst) {
     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
(...skipping 1453 matching lines...)
   }
   // the offset is not eligible for blinding or pooling, return the original
   // mem operand
   return MemOperand;
 }

 } // end of namespace X86Internal
 } // end of namespace Ice

 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
