Chromium Code Reviews

Index: src/IceTargetLoweringX86BaseImpl.h
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index bb26090629253933c550f046e073b7b14c076ef3..cf2af4822e8455b09104768ded2ee31ad4ee486f 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -3034,11 +3034,7 @@ void TargetX86Base<Machine>::lowerIntrinsicCall(
     return;
   }
   case Intrinsics::Memmove: {
-    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
-    Call->addArg(Instr->getArg(0));
-    Call->addArg(Instr->getArg(1));
-    Call->addArg(Instr->getArg(2));
-    lowerCall(Call);
+    lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
     return;
   }
   case Intrinsics::Memset: {
@@ -3481,15 +3477,50 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
 }
 template <class Machine>
+Type TargetX86Base<Machine>::typeForSize(uint32_t Size, bool Overflow,
  Jim Stichnoth (2015/08/12 13:41:47): Is there anything machine-specific about this func…
  ascull (2015/08/17 22:18:52): Possibly, as we discussed.
+                                         uint32_t LimitWidth) {
+  assert(Size != 0);
+  static const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32,
+                                     IceType_f64, IceType_v16i8};
  John (2015/08/12 17:44:57): See if Traits::Is64Bit ? IceType_i64 : IceType_f64 …
  ascull (2015/08/17 22:18:53): That does work. And it sorts one of the TODOs, than…
+  uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
+  if (Overflow && !llvm::isPowerOf2_32(Size))
+    ++TyIndex;
+  uint32_t MaxIndex =
+      LimitWidth ? llvm::findLastSet(LimitWidth, llvm::ZB_Undefined) : 4;
  Jim Stichnoth (2015/08/12 13:41:47): Can you use llvm::array_lengthof(TypeForSize)-1 in…
  ascull (2015/08/17 22:18:52): This is the function I tried and failed to find, T…
+  return TypeForSize[std::min(TyIndex, MaxIndex)];
+}
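  [Editor's note] To make the indexing above concrete, here is a minimal standalone
  sketch of the same size-to-width math on plain integers. The helper name is
  hypothetical and it assumes GCC/Clang's __builtin_clz; the real code uses
  llvm::findLastSet and llvm::isPowerOf2_32.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Widths indexed as in TypeForSize[]: 1 (i8), 2 (i16), 4 (i32),
    // 8 (f64), 16 (v16i8).
    static uint32_t chunkWidthForSize(uint32_t Size, bool Overflow,
                                      uint32_t LimitWidth = 0) {
      assert(Size != 0);
      static const uint32_t Widths[] = {1, 2, 4, 8, 16};
      // Index of the highest set bit, i.e. floor(log2(Size)): the widest
      // power-of-two chunk that fits in Size.
      uint32_t TyIndex = 31 - __builtin_clz(Size);
      // With Overflow, a non-power-of-two Size is rounded *up* one index so a
      // single (overlapping) copy of the next width can cover the tail.
      if (Overflow && (Size & (Size - 1)) != 0)
        ++TyIndex;
      uint32_t MaxIndex = LimitWidth ? 31 - __builtin_clz(LimitWidth) : 4;
      return Widths[std::min(TyIndex, MaxIndex)];
    }
    // chunkWidthForSize(3, false) == 2;  chunkWidthForSize(3, true) == 4;
    // chunkWidthForSize(7, true) == 8;   chunkWidthForSize(23, false) == 16.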
+
+template <class Machine>
+void TargetX86Base<Machine>::lowerCopyMem(Type Ty, Variable *Dest,
+                                          Variable *Src, uint32_t OffsetAmt) {
+  Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
+  // TODO(ascull): this or add nullptr test to _movp, _movq
+  Variable *Data = makeReg(Ty);
+
+  // TODO(ascull): is 64-bit better with vector or scalar movq?
+  auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, Src, Offset);
+  if (isVectorType(Ty))
+    _movp(Data, SrcMem);
+  else if (Ty == IceType_f64)
+    _movq(Data, SrcMem);
+  else
+    _mov(Data, SrcMem);
+
+  auto *DestMem = Traits::X86OperandMem::create(Func, Ty, Dest, Offset);
+  if (isVectorType(Ty))
+    _storep(Data, DestMem);
  John (2015/08/12 17:44:57): just curious: why store[pq]? instead of mov? you…
  ascull (2015/08/17 22:18:53): _store seems to be needed for writing to memory. I…
+  else if (Ty == IceType_f64)
+    _storeq(Data, DestMem);
+  else
+    _store(Data, DestMem);
+}
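  [Editor's note] As a reading aid (my summary, not text from the patch): the
  chunk type chosen by typeForSize decides which load/store pair lowerCopyMem
  emits —

    // v16i8      -> _movp / _storep  (16-byte SSE vector move)
    // f64        -> _movq / _storeq  (8-byte XMM move; usable even on x86-32,
    //               where no 64-bit GPR is available)
    // i32/i16/i8 -> _mov / _store    (plain scalar mov)

  Treating f64 as "an 8-byte move" rather than a floating-point operation is
  what lets the unroll use 8-byte chunks on a 32-bit target.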
+
+template <class Machine>
 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
                                          Operand *Count) {
   // There is a load and store for each chunk in the unroll
   constexpr uint32_t UNROLL_LIMIT = 8;
   constexpr uint32_t BytesPerStorep = 16;
-  constexpr uint32_t BytesPerStoreq = 8;
-  constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
   // Check if the operands are constants
   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
@@ -3504,85 +3535,87 @@ void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src,
   Variable *SrcBase = legalizeToReg(Src);
   Variable *DestBase = legalizeToReg(Dest);
-  auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) {
-    Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
-    // TODO(ascull): this or add nullptr test to _movp, _movq
-    Variable *Data = makeReg(Ty);
+  // Find the largest type that can be used and use it as much as possible in
+  // reverse order. Then handle any remainder with overlapping copies. Since
+  // the remainder will be at the end, there will be reduces pressure on the
  Jim Stichnoth (2015/08/12 13:41:47): reduced (I think)
  ascull (2015/08/17 22:18:53): Done.

+  // memory unit as the access to the same memory are far apart.

  Jim Stichnoth (2015/08/12 13:41:47): accesses?
  ascull (2015/08/17 22:18:53): Done.
+  Type Ty = typeForSize(CountValue);
+  uint32_t TyWidth = typeWidthInBytes(Ty);
+
+  uint32_t RemainingBytes = CountValue;
  John (2015/08/12 17:44:57): do you really need uint32_t's here? unless you rea…
  ascull (2015/08/17 22:18:53): The CountValue comes from a size_t so is unsigned.
+  uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+  while (RemainingBytes >= TyWidth) {
+    lowerCopyMem(Ty, DestBase, SrcBase, Offset);
+    RemainingBytes -= TyWidth;
+    Offset -= TyWidth;
+  }
-    // TODO(ascull): is 64-bit better with vector or scalar movq?
-    auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset);
-    if (isVectorType(Ty))
-      _movp(Data, SrcMem);
-    else if (Ty == IceType_f64)
-      _movq(Data, SrcMem);
-    else
-      _mov(Data, SrcMem);
+  if (RemainingBytes == 0)
+    return;
-    auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset);
-    if (isVectorType(Ty))
-      _storep(Data, DestMem);
-    else if (Ty == IceType_f64)
-      _storeq(Data, DestMem);
-    else
-      _store(Data, DestMem);
-  };
+  // Lower the remaining bytes. Adjust to larger types in order to make use
+  // of overlaps in the copies.
+  constexpr bool Overflow = true;
+  Type LeftOverTy = typeForSize(RemainingBytes, Overflow);
+  Offset = CountValue - typeWidthInBytes(LeftOverTy);
  John (2015/08/12 17:44:57): be careful with unaligned memory accesses. it migh…
  ascull (2015/08/17 22:18:53): This is the way gcc and clang do it. Since the ali…
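  [Editor's note] Context for the exchange above: SSE movups and plain x86 mov
  both tolerate unaligned addresses (unlike movaps/movdqa, which fault), so the
  overlapping-tail trick trades a potentially slower unaligned access for fewer
  instructions — the same trade-off gcc and clang make in their inline memcpy
  expansions.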
+    lowerCopyMem(LeftOverTy, DestBase, SrcBase, Offset);
+    return;
+  }
-  // Lowers the assignment to the remaining bytes. Assumes the original size
-  // was large enough to allow for overlaps.
-  auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) {
-    if (Size > BytesPerStoreq) {
-      lowerCopy(IceType_v16i8, CountValue - BytesPerStorep);
-    } else if (Size > BytesPerStorei32) {
-      lowerCopy(IceType_f64, CountValue - BytesPerStoreq);
-    } else if (Size > BytesPerStorei16) {
-      lowerCopy(IceType_i32, CountValue - BytesPerStorei32);
-    } else if (Size > BytesPerStorei8) {
-      lowerCopy(IceType_i16, CountValue - BytesPerStorei16);
-    } else if (Size == BytesPerStorei8) {
-      lowerCopy(IceType_i8, CountValue - BytesPerStorei8);
-    }
-  };
+  // Fall back on a function call
+  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+  Call->addArg(Dest);
+  Call->addArg(Src);
+  Call->addArg(Count);
+  lowerCall(Call);
+}
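  [Editor's worked example] For a hypothetical constant 23-byte memcpy, the
  lowering above picks Ty = v16i8 (16 bytes) and emits one chunk copy for bytes
  [0, 16), leaving 7 bytes; typeForSize(7, Overflow) then rounds up to f64
  (8 bytes) at Offset = 23 - 8 = 15, copying bytes [15, 23). Byte 15 is written
  twice, which is harmless because memcpy's operands may not overlap. In plain
  C++ the effect is:

    #include <cstdint>
    #include <cstring>

    void memcpy23(uint8_t *Dest, const uint8_t *Src) {
      std::memcpy(Dest, Src, 16);          // the single v16i8 chunk
      std::memcpy(Dest + 15, Src + 15, 8); // overlapping f64-width tail
    }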
-  if (CountValue >= BytesPerStorep) {
-    // Use large vector operations
-    for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-      N -= BytesPerStorep;
-      lowerCopy(IceType_v16i8, N);
-    }
-    lowerLeftOvers(CountValue & 0xF);
-    return;
-  }
+template <class Machine>
+void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src,
+                                          Operand *Count) {
+  // There is a load and store for each chunk in the unroll
+  constexpr uint32_t UNROLL_LIMIT = 8;
+  constexpr uint32_t BytesPerStorep = 16;
-  // Too small to use large vector operations so use small ones instead
-  if (CountValue >= BytesPerStoreq) {
-    lowerCopy(IceType_f64, 0);
-    lowerLeftOvers(CountValue - BytesPerStoreq);
-    return;
-  }
+  // Check if the operands are constants
+  const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
+  const bool IsCountConst = CountConst != nullptr;
+  const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
-  // Too small for vector operations so use scalar ones
-  if (CountValue >= BytesPerStorei32) {
-    lowerCopy(IceType_i32, 0);
-    lowerLeftOvers(CountValue - BytesPerStorei32);
+  if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+    // Unlikely, but nothing to do if it does happen
+    if (CountValue == 0)
       return;
-  }
-  // 3 is the awkward size as it is too small for the vector or 32-bit
-  // operations and will not work with lowerLeftOvers as there is no valid
-  // overlap.
-  if (CountValue == 3) {
-    lowerCopy(IceType_i16, 0);
-    lowerCopy(IceType_i8, 2);
-    return;
+    Variable *SrcBase = legalizeToReg(Src);
+    Variable *DestBase = legalizeToReg(Dest);
+
+    // Make sure to only copy down to avoid overlap issues. This means Src must
+    // have a greater address than Dest.
+    _cmp(SrcBase, DestBase);
+    typename Traits::Insts::Label *Label =
+        Traits::Insts::Label::create(Func, this);
+    _br(Traits::Cond::Br_b, Label);
+    _xchg(SrcBase, DestBase);
+    Context.insert(Label);
+
+    // We can't assume overlapping copies are safe as the copy may overwrite
+    // the source. This means we need to repeatedly select the largest size to
+    // copy that doesn't cause overlap.
+    uint32_t Size = CountValue;
+    uint32_t Offset = 0;
+    while (Size > 0) {
+      Type Ty = typeForSize(Size);
+      lowerCopyMem(Ty, DestBase, SrcBase, Offset);
+      Size -= typeWidthInBytes(Ty);
+      Offset += typeWidthInBytes(Ty);
     }
-  // 1 or 2 can be done in a single scalar copy
-  lowerLeftOvers(CountValue);
     return;
   }
   // Fall back on a function call
-  InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
+  InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
   Call->addArg(Dest);
   Call->addArg(Src);
   Call->addArg(Count);
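  [Editor's note] For context on the overlap hazard this routine guards against
  (my illustration, not from the patch): a straight low-to-high copy corrupts
  data whenever Dest overlaps Src from above, which is why memmove needs the
  pointer comparison above while memcpy does not.

    #include <cstdint>

    void naiveForwardCopy(uint8_t *Dest, const uint8_t *Src, uint32_t N) {
      for (uint32_t I = 0; I < N; ++I)
        Dest[I] = Src[I]; // reads bytes already overwritten when Dest > Src
    }
    // With uint8_t Buf[] = {1, 2, 3, 4, 5}, naiveForwardCopy(Buf + 1, Buf, 4)
    // leaves {1, 1, 1, 1, 1}; std::memmove(Buf + 1, Buf, 4) leaves
    // {1, 1, 2, 3, 4}.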
@@ -3596,8 +3629,6 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
   constexpr uint32_t BytesPerStorep = 16;
   constexpr uint32_t BytesPerStoreq = 8;
   constexpr uint32_t BytesPerStorei32 = 4;
-  constexpr uint32_t BytesPerStorei16 = 2;
-  constexpr uint32_t BytesPerStorei8 = 1;
   assert(Val->getType() == IceType_i8);
   // Check if the operands are constants
@@ -3617,12 +3648,12 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
   // eax, ax and al.
   if (IsCountConst && IsValConst) {
     Variable *Base = nullptr;
+    Variable *VecReg = nullptr;
     const uint32_t SpreadValue =
         (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
-    Variable *VecReg = nullptr;
     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
-                                                       uint32_t OffsetAmt) {
+                                                        uint32_t OffsetAmt) {
       assert(Base != nullptr);
       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
@@ -3631,7 +3662,7 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
       if (isVectorType(Ty)) {
         assert(VecReg != nullptr);
         _storep(VecReg, Mem);
-      } else if (Ty == IceType_i64) {
+      } else if (Ty == IceType_f64) {
         assert(VecReg != nullptr);
         _storeq(VecReg, Mem);
       } else {
@@ -3639,63 +3670,47 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
       }
     };
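  [Editor's note] A quick concrete check of SpreadValue (my example values, not
  from the patch): replicating the 8-bit fill byte across a 32-bit word lets one
  scalar store set four bytes at a time.

    #include <cstdint>

    uint32_t spread(uint8_t ValValue) {
      return (uint32_t{ValValue} << 24) | (uint32_t{ValValue} << 16) |
             (uint32_t{ValValue} << 8) | ValValue;
    }
    // spread(0xAB) == 0xABABABAB, so one i32 store covers four memset bytes.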
-    // Lowers the assignment to the remaining bytes. Assumes the original size
-    // was large enough to allow for overlaps.
-    auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) {
-      if (Size > BytesPerStoreq) {
-        lowerSet(IceType_v16i8, CountValue - BytesPerStorep);
-      } else if (Size > BytesPerStorei32) {
-        lowerSet(IceType_i64, CountValue - BytesPerStoreq);
-      } else if (Size > BytesPerStorei16) {
-        lowerSet(IceType_i32, CountValue - BytesPerStorei32);
-      } else if (Size > BytesPerStorei8) {
-        lowerSet(IceType_i16, CountValue - BytesPerStorei16);
-      } else if (Size == BytesPerStorei8) {
-        lowerSet(IceType_i8, CountValue - BytesPerStorei8);
-      }
-    };
-
-    // When the value is zero it can be loaded into a vector register cheaply
-    // using the xor trick.
+    // Find the largest type that can be used and use it as much as possible in
+    // reverse order. Then handle any remainder with overlapping copies. Since
+    // the remainder will be at the end, there will be reduced pressure on the
+    // memory unit as the accesses to the same memory are far apart.
+    Type Ty;
     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
         CountValue <= BytesPerStorep * UNROLL_LIMIT) {
+      // When the value is zero it can be loaded into a vector register cheaply
+      // using the xor trick.
       Base = legalizeToReg(Dest);
       VecReg = makeVectorOfZeros(IceType_v16i8);
+      Ty = typeForSize(CountValue);
+    } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
+      // When the value is non-zero or the count is small we can't use vector
+      // instructions so are limited to 32-bit stores.
+      Base = legalizeToReg(Dest);
+      constexpr bool Overflow = false;
+      constexpr uint32_t LimitWidth = 4;
+      Ty = typeForSize(CountValue, Overflow, LimitWidth);
+    }
-    // Too small to use large vector operations so use small ones instead
-    if (CountValue < BytesPerStorep) {
-      lowerSet(IceType_i64, 0);
-      lowerLeftOvers(CountValue - BytesPerStoreq);
-      return;
-    }
-
-    // Use large vector operations
-    for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) {
-      N -= 16;
-      lowerSet(IceType_v16i8, N);
+    if (Base) {
+      uint32_t TyWidth = typeWidthInBytes(Ty);
+
+      uint32_t RemainingBytes = CountValue;
+      uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
+      while (RemainingBytes >= TyWidth) {
+        lowerSet(Ty, Offset);
+        RemainingBytes -= TyWidth;
+        Offset -= TyWidth;
       }
-      lowerLeftOvers(CountValue & 0xF);
-      return;
-    }
-    // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al?
-    if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) {
-      Base = legalizeToReg(Dest);
-      // 3 is the awkward size as it is too small for the vector or 32-bit
-      // operations and will not work with lowerLeftOvers as there is no valid
-      // overlap.
-      if (CountValue == 3) {
-        lowerSet(IceType_i16, 0);
-        lowerSet(IceType_i8, 2);
+      if (RemainingBytes == 0)
         return;
-      }
-      // TODO(ascull); 64-bit can do better with 64-bit mov
-      for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) {
-        N -= 4;
-        lowerSet(IceType_i32, N);
-      }
-      lowerLeftOvers(CountValue & 0x3);
+      // Lower the remaining bytes. Adjust to larger types in order to make use
+      // of overlaps in the copies.
+      constexpr bool Overflow = true;
+      Type LeftOverTy = typeForSize(RemainingBytes, Overflow);
+      Offset = CountValue - typeWidthInBytes(LeftOverTy);
+      lowerSet(LeftOverTy, Offset);
       return;
     }
   }
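  [Editor's worked example] For a hypothetical memset(Dest, 0, 23), the
  zero-value path above applies: VecReg is zeroed via makeVectorOfZeros
  (presumably an xor/pxor idiom), Ty = v16i8, one 16-byte store covers bytes
  [0, 16), and the f64-width leftover store at Offset = 23 - 8 = 15 covers
  bytes [15, 23), harmlessly rewriting byte 15. In plain C++ the effect is:

    #include <cstdint>
    #include <cstring>

    void memset23Zero(uint8_t *Dest) {
      std::memset(Dest, 0, 16);     // the v16i8 chunk from the zeroed XMM reg
      std::memset(Dest + 15, 0, 8); // overlapping f64-width tail store
    }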