Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3016 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3027 Call->addArg(Instr->getArg(0)); | 3027 Call->addArg(Instr->getArg(0)); |
| 3028 Call->addArg(Instr->getArg(1)); | 3028 Call->addArg(Instr->getArg(1)); |
| 3029 lowerCall(Call); | 3029 lowerCall(Call); |
| 3030 return; | 3030 return; |
| 3031 } | 3031 } |
| 3032 case Intrinsics::Memcpy: { | 3032 case Intrinsics::Memcpy: { |
| 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3034 return; | 3034 return; |
| 3035 } | 3035 } |
| 3036 case Intrinsics::Memmove: { | 3036 case Intrinsics::Memmove: { |
| 3037 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3037 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3038 Call->addArg(Instr->getArg(0)); | |
| 3039 Call->addArg(Instr->getArg(1)); | |
| 3040 Call->addArg(Instr->getArg(2)); | |
| 3041 lowerCall(Call); | |
| 3042 return; | 3038 return; |
| 3043 } | 3039 } |
| 3044 case Intrinsics::Memset: { | 3040 case Intrinsics::Memset: { |
| 3045 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3041 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3046 return; | 3042 return; |
| 3047 } | 3043 } |
| 3048 case Intrinsics::NaClReadTP: { | 3044 case Intrinsics::NaClReadTP: { |
| 3049 if (Ctx->getFlags().getUseSandboxing()) { | 3045 if (Ctx->getFlags().getUseSandboxing()) { |
| 3050 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); | 3046 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); |
| 3051 Variable *Dest = Instr->getDest(); | 3047 Variable *Dest = Instr->getDest(); |
| (...skipping 422 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3474 _bsr(T_Dest2, SecondVar); | 3470 _bsr(T_Dest2, SecondVar); |
| 3475 _xor(T_Dest2, ThirtyOne); | 3471 _xor(T_Dest2, ThirtyOne); |
| 3476 } | 3472 } |
| 3477 _test(SecondVar, SecondVar); | 3473 _test(SecondVar, SecondVar); |
| 3478 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3474 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3479 _mov(DestLo, T_Dest2); | 3475 _mov(DestLo, T_Dest2); |
| 3480 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3476 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3481 } | 3477 } |
| 3482 | 3478 |
| 3483 template <class Machine> | 3479 template <class Machine> |
| 3480 Type TargetX86Base<Machine>::typeForSize(uint32_t Size, bool Overflow, | |
|
Jim Stichnoth
2015/08/12 13:41:47
Is there anything machine-specific about this func
ascull
2015/08/17 22:18:52
Possibly, as we discussed.
| |
| 3481 uint32_t LimitWidth) { | |
| 3482 assert(Size != 0); | |
| 3483 static const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, | |
| 3484 IceType_f64, IceType_v16i8}; | |
|
John
2015/08/12 17:44:57
See if
Traits::Is64Bit ? IceType_i64 : IceType_f6
ascull
2015/08/17 22:18:53
That does work. And it sorts on of the TODOs, than
| |
| 3485 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); | |
| 3486 if (Overflow && !llvm::isPowerOf2_32(Size)) | |
| 3487 ++TyIndex; | |
| 3488 uint32_t MaxIndex = | |
| 3489 LimitWidth ? llvm::findLastSet(LimitWidth, llvm::ZB_Undefined) : 4; | |
|
Jim Stichnoth
2015/08/12 13:41:47
Can you use llvm::array_lengthof(TypeForSize)-1 in
ascull
2015/08/17 22:18:52
This is the function I tried and failed to find, T
| |
| 3490 return TypeForSize[std::min(TyIndex, MaxIndex)]; | |
| 3491 } | |
| 3492 | |
| 3493 template <class Machine> | |
| 3494 void TargetX86Base<Machine>::lowerCopyMem(Type Ty, Variable *Dest, | |
| 3495 Variable *Src, uint32_t OffsetAmt) { | |
| 3496 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | |
| 3497 // TODO(ascull): this or add nullptr test to _movp, _movq | |
| 3498 Variable *Data = makeReg(Ty); | |
| 3499 | |
| 3500 // TODO(ascull): is 64-bit better with vector or scalar movq? | |
| 3501 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, Src, Offset); | |
| 3502 if (isVectorType(Ty)) | |
| 3503 _movp(Data, SrcMem); | |
| 3504 else if (Ty == IceType_f64) | |
| 3505 _movq(Data, SrcMem); | |
| 3506 else | |
| 3507 _mov(Data, SrcMem); | |
| 3508 | |
| 3509 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, Dest, Offset); | |
| 3510 if (isVectorType(Ty)) | |
| 3511 _storep(Data, DestMem); | |
|
John
2015/08/12 17:44:57
just curious: why store[pq]\? instead of mov?
you
ascull
2015/08/17 22:18:53
_store seems to be needed for writing to memory. I
| |
| 3512 else if (Ty == IceType_f64) | |
| 3513 _storeq(Data, DestMem); | |
| 3514 else | |
| 3515 _store(Data, DestMem); | |
| 3516 } | |
| 3517 | |
| 3518 template <class Machine> | |
| 3484 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, | 3519 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| 3485 Operand *Count) { | 3520 Operand *Count) { |
| 3486 // There is a load and store for each chunk in the unroll | 3521 // There is a load and store for each chunk in the unroll |
| 3487 constexpr uint32_t UNROLL_LIMIT = 8; | 3522 constexpr uint32_t UNROLL_LIMIT = 8; |
| 3488 constexpr uint32_t BytesPerStorep = 16; | 3523 constexpr uint32_t BytesPerStorep = 16; |
| 3489 constexpr uint32_t BytesPerStoreq = 8; | |
| 3490 constexpr uint32_t BytesPerStorei32 = 4; | |
| 3491 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3492 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3493 | 3524 |
| 3494 // Check if the operands are constants | 3525 // Check if the operands are constants |
| 3495 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3526 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3496 const bool IsCountConst = CountConst != nullptr; | 3527 const bool IsCountConst = CountConst != nullptr; |
| 3497 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3528 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3498 | 3529 |
| 3499 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3530 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3500 // Unlikely, but nothing to do if it does happen | 3531 // Unlikely, but nothing to do if it does happen |
| 3501 if (CountValue == 0) | 3532 if (CountValue == 0) |
| 3502 return; | 3533 return; |
| 3503 | 3534 |
| 3504 Variable *SrcBase = legalizeToReg(Src); | 3535 Variable *SrcBase = legalizeToReg(Src); |
| 3505 Variable *DestBase = legalizeToReg(Dest); | 3536 Variable *DestBase = legalizeToReg(Dest); |
| 3506 | 3537 |
| 3507 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { | 3538 // Find the largest type that can be used and use it as much as possible in |
| 3508 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3539 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3509 // TODO(ascull): this or add nullptr test to _movp, _movq | 3540 // the remainder will be at the end, there will be reduced pressure on the |
|
Jim Stichnoth
2015/08/12 13:41:47
reduced (I think)
ascull
2015/08/17 22:18:53
Done.
| |
| 3510 Variable *Data = makeReg(Ty); | 3541 // memory unit as the accesses to the same memory are far apart. |
|
Jim Stichnoth
2015/08/12 13:41:47
accesses ?
ascull
2015/08/17 22:18:53
Done.
| |
| 3542 Type Ty = typeForSize(CountValue); | |
| 3543 uint32_t TyWidth = typeWidthInBytes(Ty); | |
| 3511 | 3544 |
| 3512 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3545 uint32_t RemainingBytes = CountValue; |
|
John
2015/08/12 17:44:57
do you really need uint32_t's here? unless you rea
ascull
2015/08/17 22:18:53
The CountValue comes from a size_t so is unsigned.
| |
| 3513 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); | 3546 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3514 if (isVectorType(Ty)) | 3547 while (RemainingBytes >= TyWidth) { |
| 3515 _movp(Data, SrcMem); | 3548 lowerCopyMem(Ty, DestBase, SrcBase, Offset); |
| 3516 else if (Ty == IceType_f64) | 3549 RemainingBytes -= TyWidth; |
| 3517 _movq(Data, SrcMem); | 3550 Offset -= TyWidth; |
| 3518 else | |
| 3519 _mov(Data, SrcMem); | |
| 3520 | |
| 3521 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); | |
| 3522 if (isVectorType(Ty)) | |
| 3523 _storep(Data, DestMem); | |
| 3524 else if (Ty == IceType_f64) | |
| 3525 _storeq(Data, DestMem); | |
| 3526 else | |
| 3527 _store(Data, DestMem); | |
| 3528 }; | |
| 3529 | |
| 3530 // Lowers the assignment to the remaining bytes. Assumes the original size | |
| 3531 // was large enough to allow for overlaps. | |
| 3532 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { | |
| 3533 if (Size > BytesPerStoreq) { | |
| 3534 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); | |
| 3535 } else if (Size > BytesPerStorei32) { | |
| 3536 lowerCopy(IceType_f64, CountValue - BytesPerStoreq); | |
| 3537 } else if (Size > BytesPerStorei16) { | |
| 3538 lowerCopy(IceType_i32, CountValue - BytesPerStorei32); | |
| 3539 } else if (Size > BytesPerStorei8) { | |
| 3540 lowerCopy(IceType_i16, CountValue - BytesPerStorei16); | |
| 3541 } else if (Size == BytesPerStorei8) { | |
| 3542 lowerCopy(IceType_i8, CountValue - BytesPerStorei8); | |
| 3543 } | |
| 3544 }; | |
| 3545 | |
| 3546 if (CountValue >= BytesPerStorep) { | |
| 3547 // Use large vector operations | |
| 3548 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | |
| 3549 N -= BytesPerStorep; | |
| 3550 lowerCopy(IceType_v16i8, N); | |
| 3551 } | |
| 3552 lowerLeftOvers(CountValue & 0xF); | |
| 3553 return; | |
| 3554 } | 3551 } |
| 3555 | 3552 |
| 3556 // Too small to use large vector operations so use small ones instead | 3553 if (RemainingBytes == 0) |
| 3557 if (CountValue >= BytesPerStoreq) { | |
| 3558 lowerCopy(IceType_f64, 0); | |
| 3559 lowerLeftOvers(CountValue - BytesPerStoreq); | |
| 3560 return; | 3554 return; |
| 3561 } | |
| 3562 | 3555 |
| 3563 // Too small for vector operations so use scalar ones | 3556 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3564 if (CountValue >= BytesPerStorei32) { | 3557 // of overlaps in the copies. |
| 3565 lowerCopy(IceType_i32, 0); | 3558 constexpr bool Overflow = true; |
| 3566 lowerLeftOvers(CountValue - BytesPerStorei32); | 3559 Type LeftOverTy = typeForSize(RemainingBytes, Overflow); |
| 3567 return; | 3560 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
|
John
2015/08/12 17:44:57
be careful with unaligned memory accesses. it migh
ascull
2015/08/17 22:18:53
This is the way gcc and clang do it. Since the ali
| |
| 3568 } | 3561 lowerCopyMem(LeftOverTy, DestBase, SrcBase, Offset); |
| 3569 | |
| 3570 // 3 is the awkward size as it is too small for the vector or 32-bit | |
| 3571 // operations and will not work with lowerLeftOvers as there is no valid | |
| 3572 // overlap. | |
| 3573 if (CountValue == 3) { | |
| 3574 lowerCopy(IceType_i16, 0); | |
| 3575 lowerCopy(IceType_i8, 2); | |
| 3576 return; | |
| 3577 } | |
| 3578 | |
| 3579 // 1 or 2 can be done in a single scalar copy | |
| 3580 lowerLeftOvers(CountValue); | |
| 3581 return; | 3562 return; |
| 3582 } | 3563 } |
| 3583 | 3564 |
| 3584 // Fall back on a function call | 3565 // Fall back on a function call |
| 3585 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | 3566 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| 3586 Call->addArg(Dest); | 3567 Call->addArg(Dest); |
| 3587 Call->addArg(Src); | 3568 Call->addArg(Src); |
| 3588 Call->addArg(Count); | 3569 Call->addArg(Count); |
| 3570 lowerCall(Call); | |
| 3571 } | |
| 3572 | |
| 3573 template <class Machine> | |
| 3574 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, | |
| 3575 Operand *Count) { | |
| 3576 // There is a load and store for each chunk in the unroll | |
| 3577 constexpr uint32_t UNROLL_LIMIT = 8; | |
| 3578 constexpr uint32_t BytesPerStorep = 16; | |
| 3579 | |
| 3580 // Check if the operands are constants | |
| 3581 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | |
| 3582 const bool IsCountConst = CountConst != nullptr; | |
| 3583 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | |
| 3584 | |
| 3585 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | |
| 3586 // Unlikely, but nothing to do if it does happen | |
| 3587 if (CountValue == 0) | |
| 3588 return; | |
| 3589 | |
| 3590 Variable *SrcBase = legalizeToReg(Src); | |
| 3591 Variable *DestBase = legalizeToReg(Dest); | |
| 3592 | |
| 3593 // Make sure to only copy down to avoid overlap issues. This means Src must | |
| 3594 // have a greater address than Dest. | |
| 3595 _cmp(SrcBase, DestBase); | |
| 3596 typename Traits::Insts::Label *Label = | |
| 3597 Traits::Insts::Label::create(Func, this); | |
| 3598 _br(Traits::Cond::Br_b, Label); | |
| 3599 _xchg(SrcBase, DestBase); | |
| 3600 Context.insert(Label); | |
| 3601 | |
| 3602 // We can't assume overlapping copies are safe as the copy may overwrite | |
| 3603 // the source. This means we need to repeatedly select the largest size to | |
| 3604 // copy that doesn't cause overlap. | |
| 3605 uint32_t Size = CountValue; | |
| 3606 uint32_t Offset = 0; | |
| 3607 while (Size > 0) { | |
| 3608 Type Ty = typeForSize(Size); | |
| 3609 lowerCopyMem(Ty, DestBase, SrcBase, Offset); | |
| 3610 Size -= typeWidthInBytes(Ty); | |
| 3611 Offset += typeWidthInBytes(Ty); | |
| 3612 } | |
| 3613 | |
| 3614 return; | |
| 3615 } | |
| 3616 | |
| 3617 // Fall back on a function call | |
| 3618 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | |
| 3619 Call->addArg(Dest); | |
| 3620 Call->addArg(Src); | |
| 3621 Call->addArg(Count); | |
| 3589 lowerCall(Call); | 3622 lowerCall(Call); |
| 3590 } | 3623 } |
| 3591 | 3624 |
| 3592 template <class Machine> | 3625 template <class Machine> |
| 3593 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3626 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3594 Operand *Count) { | 3627 Operand *Count) { |
| 3595 constexpr uint32_t UNROLL_LIMIT = 16; | 3628 constexpr uint32_t UNROLL_LIMIT = 16; |
| 3596 constexpr uint32_t BytesPerStorep = 16; | 3629 constexpr uint32_t BytesPerStorep = 16; |
| 3597 constexpr uint32_t BytesPerStoreq = 8; | 3630 constexpr uint32_t BytesPerStoreq = 8; |
| 3598 constexpr uint32_t BytesPerStorei32 = 4; | 3631 constexpr uint32_t BytesPerStorei32 = 4; |
| 3599 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3600 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3601 assert(Val->getType() == IceType_i8); | 3632 assert(Val->getType() == IceType_i8); |
| 3602 | 3633 |
| 3603 // Check if the operands are constants | 3634 // Check if the operands are constants |
| 3604 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3635 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3605 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3636 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3606 const bool IsCountConst = CountConst != nullptr; | 3637 const bool IsCountConst = CountConst != nullptr; |
| 3607 const bool IsValConst = ValConst != nullptr; | 3638 const bool IsValConst = ValConst != nullptr; |
| 3608 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3639 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3609 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3640 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3610 | 3641 |
| 3611 // Unlikely, but nothing to do if it does happen | 3642 // Unlikely, but nothing to do if it does happen |
| 3612 if (IsCountConst && CountValue == 0) | 3643 if (IsCountConst && CountValue == 0) |
| 3613 return; | 3644 return; |
| 3614 | 3645 |
| 3615 // TODO(ascull): if the count is constant but val is not it would be possible | 3646 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3616 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3647 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3617 // eax, ax and al. | 3648 // eax, ax and al. |
| 3618 if (IsCountConst && IsValConst) { | 3649 if (IsCountConst && IsValConst) { |
| 3619 Variable *Base = nullptr; | 3650 Variable *Base = nullptr; |
| 3651 Variable *VecReg = nullptr; | |
| 3620 const uint32_t SpreadValue = | 3652 const uint32_t SpreadValue = |
| 3621 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3653 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3622 Variable *VecReg = nullptr; | |
| 3623 | 3654 |
| 3624 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, | 3655 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3625 uint32_t OffsetAmt) { | 3656 uint32_t OffsetAmt) { |
| 3626 assert(Base != nullptr); | 3657 assert(Base != nullptr); |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3658 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3628 | 3659 |
| 3629 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3660 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3630 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3661 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3631 if (isVectorType(Ty)) { | 3662 if (isVectorType(Ty)) { |
| 3632 assert(VecReg != nullptr); | 3663 assert(VecReg != nullptr); |
| 3633 _storep(VecReg, Mem); | 3664 _storep(VecReg, Mem); |
| 3634 } else if (Ty == IceType_i64) { | 3665 } else if (Ty == IceType_f64) { |
| 3635 assert(VecReg != nullptr); | 3666 assert(VecReg != nullptr); |
| 3636 _storeq(VecReg, Mem); | 3667 _storeq(VecReg, Mem); |
| 3637 } else { | 3668 } else { |
| 3638 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3669 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3639 } | 3670 } |
| 3640 }; | 3671 }; |
| 3641 | 3672 |
| 3642 // Lowers the assignment to the remaining bytes. Assumes the original size | 3673 // Find the largest type that can be used and use it as much as possible in |
| 3643 // was large enough to allow for overlaps. | 3674 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3644 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { | 3675 // the remainder will be at the end, there will be reduced pressure on the |
| 3645 if (Size > BytesPerStoreq) { | 3676 // memory unit as the accesses to the same memory are far apart. |
| 3646 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); | 3677 Type Ty; |
| 3647 } else if (Size > BytesPerStorei32) { | |
| 3648 lowerSet(IceType_i64, CountValue - BytesPerStoreq); | |
| 3649 } else if (Size > BytesPerStorei16) { | |
| 3650 lowerSet(IceType_i32, CountValue - BytesPerStorei32); | |
| 3651 } else if (Size > BytesPerStorei8) { | |
| 3652 lowerSet(IceType_i16, CountValue - BytesPerStorei16); | |
| 3653 } else if (Size == BytesPerStorei8) { | |
| 3654 lowerSet(IceType_i8, CountValue - BytesPerStorei8); | |
| 3655 } | |
| 3656 }; | |
| 3657 | |
| 3658 // When the value is zero it can be loaded into a vector register cheaply | |
| 3659 // using the xor trick. | |
| 3660 if (ValValue == 0 && CountValue >= BytesPerStoreq && | 3678 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3661 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3679 CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3680 // When the value is zero it can be loaded into a vector register cheaply | |
| 3681 // using the xor trick. | |
| 3662 Base = legalizeToReg(Dest); | 3682 Base = legalizeToReg(Dest); |
| 3663 VecReg = makeVectorOfZeros(IceType_v16i8); | 3683 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3684 Ty = typeForSize(CountValue); | |
| 3685 } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | |
| 3686 // When the value is non-zero or the count is small we can't use vector | |
| 3687 // instructions so are limited to 32-bit stores. |
| 3688 Base = legalizeToReg(Dest); | |
| 3689 constexpr bool Offset = false; | |
| 3690 constexpr uint32_t LimitWidth = 4; | |
| 3691 Ty = typeForSize(CountValue, Offset, LimitWidth); | |
| 3692 } | |
| 3664 | 3693 |
| 3665 // Too small to use large vector operations so use small ones instead | 3694 if (Base) { |
| 3666 if (CountValue < BytesPerStorep) { | 3695 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3667 lowerSet(IceType_i64, 0); | 3696 |
| 3668 lowerLeftOvers(CountValue - BytesPerStoreq); | 3697 uint32_t RemainingBytes = CountValue; |
| 3669 return; | 3698 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3699 while (RemainingBytes >= TyWidth) { | |
| 3700 lowerSet(Ty, Offset); | |
| 3701 RemainingBytes -= TyWidth; | |
| 3702 Offset -= TyWidth; | |
| 3670 } | 3703 } |
| 3671 | 3704 |
| 3672 // Use large vector operations | 3705 if (RemainingBytes == 0) |
| 3673 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3706 return; |
| 3674 N -= 16; | |
| 3675 lowerSet(IceType_v16i8, N); | |
| 3676 } | |
| 3677 lowerLeftOvers(CountValue & 0xF); | |
| 3678 return; | |
| 3679 } | |
| 3680 | 3707 |
| 3681 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3708 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3682 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | 3709 // of overlaps in the copies. |
| 3683 Base = legalizeToReg(Dest); | 3710 constexpr bool Overflow = true; |
| 3684 // 3 is the awkward size as it is too small for the vector or 32-bit | 3711 Type LeftOverTy = typeForSize(RemainingBytes, Overflow); |
| 3685 // operations and will not work with lowerLeftOvers as there is no valid | 3712 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3686 // overlap. | 3713 lowerSet(LeftOverTy, Offset); |
| 3687 if (CountValue == 3) { | |
| 3688 lowerSet(IceType_i16, 0); | |
| 3689 lowerSet(IceType_i8, 2); | |
| 3690 return; | |
| 3691 } | |
| 3692 | |
| 3693 // TODO(ascull); 64-bit can do better with 64-bit mov | |
| 3694 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3695 N -= 4; | |
| 3696 lowerSet(IceType_i32, N); | |
| 3697 } | |
| 3698 lowerLeftOvers(CountValue & 0x3); | |
| 3699 return; | 3714 return; |
| 3700 } | 3715 } |
| 3701 } | 3716 } |
| 3702 | 3717 |
| 3703 // Fall back on calling the memset function. The value operand needs to be | 3718 // Fall back on calling the memset function. The value operand needs to be |
| 3704 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3719 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3705 // be at least 32 bits wide. | 3720 // be at least 32 bits wide. |
| 3706 Operand *ValExt; | 3721 Operand *ValExt; |
| 3707 if (IsValConst) { | 3722 if (IsValConst) { |
| 3708 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3723 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1453 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5162 } | 5177 } |
| 5163 // the offset is not eligible for blinding or pooling, return the original | 5178 // the offset is not eligible for blinding or pooling, return the original |
| 5164 // mem operand | 5179 // mem operand |
| 5165 return MemOperand; | 5180 return MemOperand; |
| 5166 } | 5181 } |
| 5167 | 5182 |
| 5168 } // end of namespace X86Internal | 5183 } // end of namespace X86Internal |
| 5169 } // end of namespace Ice | 5184 } // end of namespace Ice |
| 5170 | 5185 |
| 5171 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5186 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
| OLD | NEW |