Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// | 1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==// |
| 2 // | 2 // |
| 3 // The Subzero Code Generator | 3 // The Subzero Code Generator |
| 4 // | 4 // |
| 5 // This file is distributed under the University of Illinois Open Source | 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. | 6 // License. See LICENSE.TXT for details. |
| 7 // | 7 // |
| 8 //===----------------------------------------------------------------------===// | 8 //===----------------------------------------------------------------------===// |
| 9 /// | 9 /// |
| 10 /// \file | 10 /// \file |
| (...skipping 3016 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3027 Call->addArg(Instr->getArg(0)); | 3027 Call->addArg(Instr->getArg(0)); |
| 3028 Call->addArg(Instr->getArg(1)); | 3028 Call->addArg(Instr->getArg(1)); |
| 3029 lowerCall(Call); | 3029 lowerCall(Call); |
| 3030 return; | 3030 return; |
| 3031 } | 3031 } |
| 3032 case Intrinsics::Memcpy: { | 3032 case Intrinsics::Memcpy: { |
| 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3033 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3034 return; | 3034 return; |
| 3035 } | 3035 } |
| 3036 case Intrinsics::Memmove: { | 3036 case Intrinsics::Memmove: { |
| 3037 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | 3037 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3038 Call->addArg(Instr->getArg(0)); | |
| 3039 Call->addArg(Instr->getArg(1)); | |
| 3040 Call->addArg(Instr->getArg(2)); | |
| 3041 lowerCall(Call); | |
| 3042 return; | 3038 return; |
| 3043 } | 3039 } |
| 3044 case Intrinsics::Memset: { | 3040 case Intrinsics::Memset: { |
| 3045 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); | 3041 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2)); |
| 3046 return; | 3042 return; |
| 3047 } | 3043 } |
| 3048 case Intrinsics::NaClReadTP: { | 3044 case Intrinsics::NaClReadTP: { |
| 3049 if (Ctx->getFlags().getUseSandboxing()) { | 3045 if (Ctx->getFlags().getUseSandboxing()) { |
| 3050 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); | 3046 Operand *Src = dispatchToConcrete(&Machine::createNaClReadTPSrcOperand); |
| 3051 Variable *Dest = Instr->getDest(); | 3047 Variable *Dest = Instr->getDest(); |
| (...skipping 422 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3474 _bsr(T_Dest2, SecondVar); | 3470 _bsr(T_Dest2, SecondVar); |
| 3475 _xor(T_Dest2, ThirtyOne); | 3471 _xor(T_Dest2, ThirtyOne); |
| 3476 } | 3472 } |
| 3477 _test(SecondVar, SecondVar); | 3473 _test(SecondVar, SecondVar); |
| 3478 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); | 3474 _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); |
| 3479 _mov(DestLo, T_Dest2); | 3475 _mov(DestLo, T_Dest2); |
| 3480 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); | 3476 _mov(DestHi, Ctx->getConstantZero(IceType_i32)); |
| 3481 } | 3477 } |
| 3482 | 3478 |
| 3483 template <class Machine> | 3479 template <class Machine> |
| 3480 void TargetX86Base<Machine>::typedLoad(Type Ty, Variable *Dest, Variable *Base, | |
| 3481 Constant *Offset) { | |
| 3482 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | |
| 3483 | |
| 3484 if (isVectorType(Ty)) | |
| 3485 _movp(Dest, Mem); | |
| 3486 else if (Ty == IceType_f64) | |
| 3487 _movq(Dest, Mem); | |
| 3488 else | |
| 3489 _mov(Dest, Mem); | |
| 3490 } | |
| 3491 | |
| 3492 template <class Machine> | |
| 3493 void TargetX86Base<Machine>::typedStore(Type Ty, Variable *Value, | |
| 3494 Variable *Base, Constant *Offset) { | |
| 3495 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | |
| 3496 | |
| 3497 if (isVectorType(Ty)) | |
| 3498 _storep(Value, Mem); | |
| 3499 else if (Ty == IceType_f64) | |
| 3500 _storeq(Value, Mem); | |
| 3501 else | |
| 3502 _store(Value, Mem); | |
| 3503 } | |
| 3504 | |
| 3505 template <class Machine> | |
| 3506 void TargetX86Base<Machine>::copyMemory(Type Ty, Variable *Dest, Variable *Src, | |
| 3507 int32_t OffsetAmt) { | |
| 3508 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | |
| 3509 // TODO(ascull): this or add nullptr test to _movp, _movq | |
| 3510 Variable *Data = makeReg(Ty); | |
| 3511 | |
| 3512 typedLoad(Ty, Data, Src, Offset); | |
| 3513 typedStore(Ty, Data, Dest, Offset); | |
| 3514 } | |
| 3515 | |
| 3516 template <class Machine> | |
| 3484 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, | 3517 void TargetX86Base<Machine>::lowerMemcpy(Operand *Dest, Operand *Src, |
| 3485 Operand *Count) { | 3518 Operand *Count) { |
| 3486 // There is a load and store for each chunk in the unroll | 3519 // There is a load and store for each chunk in the unroll |
| 3487 constexpr uint32_t UNROLL_LIMIT = 8; | 3520 constexpr uint32_t UNROLL_LIMIT = 8; |
| 3488 constexpr uint32_t BytesPerStorep = 16; | 3521 constexpr uint32_t BytesPerStorep = 16; |
| 3489 constexpr uint32_t BytesPerStoreq = 8; | |
| 3490 constexpr uint32_t BytesPerStorei32 = 4; | |
| 3491 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3492 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3493 | 3522 |
| 3494 // Check if the operands are constants | 3523 // Check if the operands are constants |
| 3495 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3524 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3496 const bool IsCountConst = CountConst != nullptr; | 3525 const bool IsCountConst = CountConst != nullptr; |
| 3497 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3526 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3498 | 3527 |
| 3499 if (IsCountConst && CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3528 if (shouldOptimizeMemIntrins() && IsCountConst && |
| 3529 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | |
| 3500 // Unlikely, but nothing to do if it does happen | 3530 // Unlikely, but nothing to do if it does happen |
| 3501 if (CountValue == 0) | 3531 if (CountValue == 0) |
| 3502 return; | 3532 return; |
| 3503 | 3533 |
| 3504 Variable *SrcBase = legalizeToReg(Src); | 3534 Variable *SrcBase = legalizeToReg(Src); |
| 3505 Variable *DestBase = legalizeToReg(Dest); | 3535 Variable *DestBase = legalizeToReg(Dest); |
| 3506 | 3536 |
| 3507 auto lowerCopy = [this, DestBase, SrcBase](Type Ty, uint32_t OffsetAmt) { | 3537 // Find the largest type that can be used and use it as much as possible in |
| 3508 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3538 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3509 // TODO(ascull): this or add nullptr test to _movp, _movq | 3539 // the remainder will be at the end, there will be reduced pressure on the |
| 3510 Variable *Data = makeReg(Ty); | 3540 // memory unit as the accesses to the same memory are far apart. |
| 3541 Type Ty = largestTypeInSize(CountValue); | |
| 3542 uint32_t TyWidth = typeWidthInBytes(Ty); | |
| 3511 | 3543 |
| 3512 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3544 uint32_t RemainingBytes = CountValue; |
| 3513 auto *SrcMem = Traits::X86OperandMem::create(Func, Ty, SrcBase, Offset); | 3545 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3514 if (isVectorType(Ty)) | 3546 while (RemainingBytes >= TyWidth) { |
| 3515 _movp(Data, SrcMem); | 3547 copyMemory(Ty, DestBase, SrcBase, Offset); |
| 3516 else if (Ty == IceType_f64) | 3548 RemainingBytes -= TyWidth; |
| 3517 _movq(Data, SrcMem); | 3549 Offset -= TyWidth; |
| 3518 else | |
| 3519 _mov(Data, SrcMem); | |
| 3520 | |
| 3521 auto *DestMem = Traits::X86OperandMem::create(Func, Ty, DestBase, Offset); | |
| 3522 if (isVectorType(Ty)) | |
| 3523 _storep(Data, DestMem); | |
| 3524 else if (Ty == IceType_f64) | |
| 3525 _storeq(Data, DestMem); | |
| 3526 else | |
| 3527 _store(Data, DestMem); | |
| 3528 }; | |
| 3529 | |
| 3530 // Lowers the assignment to the remaining bytes. Assumes the original size | |
| 3531 // was large enough to allow for overlaps. | |
| 3532 auto lowerLeftOvers = [this, lowerCopy, CountValue](uint32_t Size) { | |
| 3533 if (Size > BytesPerStoreq) { | |
| 3534 lowerCopy(IceType_v16i8, CountValue - BytesPerStorep); | |
| 3535 } else if (Size > BytesPerStorei32) { | |
| 3536 lowerCopy(IceType_f64, CountValue - BytesPerStoreq); | |
| 3537 } else if (Size > BytesPerStorei16) { | |
| 3538 lowerCopy(IceType_i32, CountValue - BytesPerStorei32); | |
| 3539 } else if (Size > BytesPerStorei8) { | |
| 3540 lowerCopy(IceType_i16, CountValue - BytesPerStorei16); | |
| 3541 } else if (Size == BytesPerStorei8) { | |
| 3542 lowerCopy(IceType_i8, CountValue - BytesPerStorei8); | |
| 3543 } | |
| 3544 }; | |
| 3545 | |
| 3546 if (CountValue >= BytesPerStorep) { | |
| 3547 // Use large vector operations | |
| 3548 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | |
| 3549 N -= BytesPerStorep; | |
| 3550 lowerCopy(IceType_v16i8, N); | |
| 3551 } | |
| 3552 lowerLeftOvers(CountValue & 0xF); | |
| 3553 return; | |
| 3554 } | 3550 } |
| 3555 | 3551 |
| 3556 // Too small to use large vector operations so use small ones instead | 3552 if (RemainingBytes == 0) |
| 3557 if (CountValue >= BytesPerStoreq) { | |
| 3558 lowerCopy(IceType_f64, 0); | |
| 3559 lowerLeftOvers(CountValue - BytesPerStoreq); | |
| 3560 return; | 3553 return; |
| 3561 } | |
| 3562 | 3554 |
| 3563 // Too small for vector operations so use scalar ones | 3555 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3564 if (CountValue >= BytesPerStorei32) { | 3556 // of overlaps in the copies. |
| 3565 lowerCopy(IceType_i32, 0); | 3557 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3566 lowerLeftOvers(CountValue - BytesPerStorei32); | 3558 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3567 return; | 3559 copyMemory(LeftOverTy, DestBase, SrcBase, Offset); |
| 3568 } | |
| 3569 | |
| 3570 // 3 is the awkward size as it is too small for the vector or 32-bit | |
| 3571 // operations and will not work with lowerLeftOvers as there is no valid | |
| 3572 // overlap. | |
| 3573 if (CountValue == 3) { | |
| 3574 lowerCopy(IceType_i16, 0); | |
| 3575 lowerCopy(IceType_i8, 2); | |
| 3576 return; | |
| 3577 } | |
| 3578 | |
| 3579 // 1 or 2 can be done in a single scalar copy | |
| 3580 lowerLeftOvers(CountValue); | |
| 3581 return; | 3560 return; |
| 3582 } | 3561 } |
| 3583 | 3562 |
| 3584 // Fall back on a function call | 3563 // Fall back on a function call |
| 3585 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); | 3564 InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3); |
| 3586 Call->addArg(Dest); | 3565 Call->addArg(Dest); |
| 3587 Call->addArg(Src); | 3566 Call->addArg(Src); |
| 3588 Call->addArg(Count); | 3567 Call->addArg(Count); |
| 3568 lowerCall(Call); | |
| 3569 } | |
| 3570 | |
| 3571 template <class Machine> | |
| 3572 void TargetX86Base<Machine>::lowerMemmove(Operand *Dest, Operand *Src, | |
| 3573 Operand *Count) { | |
| 3574 // There is a load and store for each chunk in the unroll | |
| 3575 constexpr uint32_t UNROLL_LIMIT = 8; // 32-bit has 8 xmm registers | |
|
jvoung (off chromium)
2015/08/20 15:42:53
I'm not sure the exact side effects on register al
Jim Stichnoth
2015/08/20 16:57:04
Could also get the limit from the Machine traits o
ascull
2015/08/20 18:41:56
I put it in traits and left the value as is for no
| |
| 3576 constexpr uint32_t BytesPerStorep = 16; | |
| 3577 | |
| 3578 // Check if the operands are constants | |
| 3579 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | |
| 3580 const bool IsCountConst = CountConst != nullptr; | |
| 3581 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | |
| 3582 | |
| 3583 if (shouldOptimizeMemIntrins() && IsCountConst && | |
| 3584 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | |
| 3585 // Unlikely, but nothing to do if it does happen | |
| 3586 if (CountValue == 0) | |
| 3587 return; | |
| 3588 | |
| 3589 Variable *SrcBase = legalizeToReg(Src); | |
| 3590 Variable *DestBase = legalizeToReg(Dest); | |
| 3591 | |
| 3592 std::tuple<Type, Constant *, Variable *> Moves[UNROLL_LIMIT]; | |
| 3593 Constant *Offset; | |
| 3594 Variable *Reg; | |
| 3595 | |
| 3596 // Copy the data into registers as the source and destination could overlap | |
| 3597 // so make sure not to clobber the memory. This also means overlapping moves | |
| 3598 // can be used as we are taking a safe snapshot of the memory. | |
| 3599 Type Ty = largestTypeInSize(CountValue); | |
| 3600 uint32_t TyWidth = typeWidthInBytes(Ty); | |
| 3601 | |
| 3602 uint32_t RemainingBytes = CountValue; | |
| 3603 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth; | |
| 3604 size_t N = 0; | |
| 3605 while (RemainingBytes >= TyWidth) { | |
| 3606 assert(N <= UNROLL_LIMIT); | |
| 3607 Offset = Ctx->getConstantInt32(OffsetAmt); | |
| 3608 Reg = makeReg(Ty); | |
| 3609 typedLoad(Ty, Reg, SrcBase, Offset); | |
| 3610 RemainingBytes -= TyWidth; | |
| 3611 OffsetAmt -= TyWidth; | |
| 3612 Moves[N++] = std::make_tuple(Ty, Offset, Reg); | |
| 3613 } | |
| 3614 | |
| 3615 if (RemainingBytes != 0) { | |
| 3616 // Lower the remaining bytes. Adjust to larger types in order to make use | |
| 3617 // of overlaps in the copies. | |
| 3618 assert(N <= UNROLL_LIMIT); | |
| 3619 Ty = firstTypeThatFitsSize(RemainingBytes); | |
| 3620 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty)); | |
| 3621 Reg = makeReg(Ty); | |
| 3622 typedLoad(Ty, Reg, SrcBase, Offset); | |
| 3623 Moves[N++] = std::make_tuple(Ty, Offset, Reg); | |
| 3624 } | |
| 3625 | |
| 3626 // Copy the data out into the destination memory | |
| 3627 for (size_t i = 0; i < N; ++i) { | |
| 3628 std::tie(Ty, Offset, Reg) = Moves[i]; | |
| 3629 typedStore(Ty, Reg, DestBase, Offset); | |
| 3630 } | |
| 3631 | |
| 3632 return; | |
| 3633 } | |
| 3634 | |
| 3635 // Fall back on a function call | |
| 3636 InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3); | |
| 3637 Call->addArg(Dest); | |
| 3638 Call->addArg(Src); | |
| 3639 Call->addArg(Count); | |
| 3589 lowerCall(Call); | 3640 lowerCall(Call); |
| 3590 } | 3641 } |
| 3591 | 3642 |
| 3592 template <class Machine> | 3643 template <class Machine> |
| 3593 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, | 3644 void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, |
| 3594 Operand *Count) { | 3645 Operand *Count) { |
| 3595 constexpr uint32_t UNROLL_LIMIT = 16; | 3646 constexpr uint32_t UNROLL_LIMIT = 16; |
| 3596 constexpr uint32_t BytesPerStorep = 16; | 3647 constexpr uint32_t BytesPerStorep = 16; |
| 3597 constexpr uint32_t BytesPerStoreq = 8; | 3648 constexpr uint32_t BytesPerStoreq = 8; |
| 3598 constexpr uint32_t BytesPerStorei32 = 4; | 3649 constexpr uint32_t BytesPerStorei32 = 4; |
| 3599 constexpr uint32_t BytesPerStorei16 = 2; | |
| 3600 constexpr uint32_t BytesPerStorei8 = 1; | |
| 3601 assert(Val->getType() == IceType_i8); | 3650 assert(Val->getType() == IceType_i8); |
| 3602 | 3651 |
| 3603 // Check if the operands are constants | 3652 // Check if the operands are constants |
| 3604 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); | 3653 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count); |
| 3605 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); | 3654 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val); |
| 3606 const bool IsCountConst = CountConst != nullptr; | 3655 const bool IsCountConst = CountConst != nullptr; |
| 3607 const bool IsValConst = ValConst != nullptr; | 3656 const bool IsValConst = ValConst != nullptr; |
| 3608 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; | 3657 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0; |
| 3609 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; | 3658 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0; |
| 3610 | 3659 |
| 3611 // Unlikely, but nothing to do if it does happen | 3660 // Unlikely, but nothing to do if it does happen |
| 3612 if (IsCountConst && CountValue == 0) | 3661 if (IsCountConst && CountValue == 0) |
| 3613 return; | 3662 return; |
| 3614 | 3663 |
| 3615 // TODO(ascull): if the count is constant but val is not it would be possible | 3664 // TODO(ascull): if the count is constant but val is not it would be possible |
| 3616 // to inline by spreading the value across 4 bytes and accessing subregs e.g. | 3665 // to inline by spreading the value across 4 bytes and accessing subregs e.g. |
| 3617 // eax, ax and al. | 3666 // eax, ax and al. |
| 3618 if (IsCountConst && IsValConst) { | 3667 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) { |
| 3619 Variable *Base = nullptr; | 3668 Variable *Base = nullptr; |
| 3669 Variable *VecReg = nullptr; | |
| 3620 const uint32_t SpreadValue = | 3670 const uint32_t SpreadValue = |
| 3621 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; | 3671 (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue; |
| 3622 Variable *VecReg = nullptr; | |
| 3623 | 3672 |
| 3624 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, | 3673 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty, |
| 3625 uint32_t OffsetAmt) { | 3674 uint32_t OffsetAmt) { |
| 3626 assert(Base != nullptr); | 3675 assert(Base != nullptr); |
| 3627 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; | 3676 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr; |
| 3628 | 3677 |
| 3629 // TODO(ascull): is 64-bit better with vector or scalar movq? | 3678 // TODO(ascull): is 64-bit better with vector or scalar movq? |
| 3630 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); | 3679 auto *Mem = Traits::X86OperandMem::create(Func, Ty, Base, Offset); |
| 3631 if (isVectorType(Ty)) { | 3680 if (isVectorType(Ty)) { |
| 3632 assert(VecReg != nullptr); | 3681 assert(VecReg != nullptr); |
| 3633 _storep(VecReg, Mem); | 3682 _storep(VecReg, Mem); |
| 3634 } else if (Ty == IceType_i64) { | 3683 } else if (Ty == IceType_f64) { |
| 3635 assert(VecReg != nullptr); | 3684 assert(VecReg != nullptr); |
| 3636 _storeq(VecReg, Mem); | 3685 _storeq(VecReg, Mem); |
| 3637 } else { | 3686 } else { |
| 3638 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); | 3687 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem); |
| 3639 } | 3688 } |
| 3640 }; | 3689 }; |
| 3641 | 3690 |
| 3642 // Lowers the assignment to the remaining bytes. Assumes the original size | 3691 // Find the largest type that can be used and use it as much as possible in |
| 3643 // was large enough to allow for overlaps. | 3692 // reverse order. Then handle any remainder with overlapping copies. Since |
| 3644 auto lowerLeftOvers = [this, lowerSet, CountValue](uint32_t Size) { | 3693 // the remainder will be at the end, there will be reduced pressure on the |
| 3645 if (Size > BytesPerStoreq) { | 3694 // memory unit as the accesses to the same memory are far apart. |
| 3646 lowerSet(IceType_v16i8, CountValue - BytesPerStorep); | 3695 Type Ty; |
| 3647 } else if (Size > BytesPerStorei32) { | |
| 3648 lowerSet(IceType_i64, CountValue - BytesPerStoreq); | |
| 3649 } else if (Size > BytesPerStorei16) { | |
| 3650 lowerSet(IceType_i32, CountValue - BytesPerStorei32); | |
| 3651 } else if (Size > BytesPerStorei8) { | |
| 3652 lowerSet(IceType_i16, CountValue - BytesPerStorei16); | |
| 3653 } else if (Size == BytesPerStorei8) { | |
| 3654 lowerSet(IceType_i8, CountValue - BytesPerStorei8); | |
| 3655 } | |
| 3656 }; | |
| 3657 | |
| 3658 // When the value is zero it can be loaded into a vector register cheaply | |
| 3659 // using the xor trick. | |
| 3660 if (ValValue == 0 && CountValue >= BytesPerStoreq && | 3696 if (ValValue == 0 && CountValue >= BytesPerStoreq && |
| 3661 CountValue <= BytesPerStorep * UNROLL_LIMIT) { | 3697 CountValue <= BytesPerStorep * UNROLL_LIMIT) { |
| 3698 // When the value is zero it can be loaded into a vector register cheaply | |
| 3699 // using the xor trick. | |
| 3662 Base = legalizeToReg(Dest); | 3700 Base = legalizeToReg(Dest); |
| 3663 VecReg = makeVectorOfZeros(IceType_v16i8); | 3701 VecReg = makeVectorOfZeros(IceType_v16i8); |
| 3702 Ty = largestTypeInSize(CountValue); | |
| 3703 } else if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | |
| 3704 // When the value is non-zero or the count is small we can't use vector | |
| 3705 // instructions so are limited to 32-bit stores. | |
| 3706 Base = legalizeToReg(Dest); | |
| 3707 constexpr uint32_t MaxSize = 4; | |
| 3708 Ty = largestTypeInSize(CountValue, MaxSize); | |
| 3709 } | |
| 3664 | 3710 |
| 3665 // Too small to use large vector operations so use small ones instead | 3711 if (Base) { |
| 3666 if (CountValue < BytesPerStorep) { | 3712 uint32_t TyWidth = typeWidthInBytes(Ty); |
| 3667 lowerSet(IceType_i64, 0); | 3713 |
| 3668 lowerLeftOvers(CountValue - BytesPerStoreq); | 3714 uint32_t RemainingBytes = CountValue; |
| 3669 return; | 3715 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth; |
| 3716 while (RemainingBytes >= TyWidth) { | |
| 3717 lowerSet(Ty, Offset); | |
| 3718 RemainingBytes -= TyWidth; | |
| 3719 Offset -= TyWidth; | |
| 3670 } | 3720 } |
| 3671 | 3721 |
| 3672 // Use large vector operations | 3722 if (RemainingBytes == 0) |
| 3673 for (uint32_t N = CountValue & 0xFFFFFFF0; N != 0;) { | 3723 return; |
| 3674 N -= 16; | |
| 3675 lowerSet(IceType_v16i8, N); | |
| 3676 } | |
| 3677 lowerLeftOvers(CountValue & 0xF); | |
| 3678 return; | |
| 3679 } | |
| 3680 | 3724 |
| 3681 // TODO(ascull): load val into reg and select subregs e.g. eax, ax, al? | 3725 // Lower the remaining bytes. Adjust to larger types in order to make use |
| 3682 if (CountValue <= BytesPerStorei32 * UNROLL_LIMIT) { | 3726 // of overlaps in the copies. |
| 3683 Base = legalizeToReg(Dest); | 3727 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes); |
| 3684 // 3 is the awkward size as it is too small for the vector or 32-bit | 3728 Offset = CountValue - typeWidthInBytes(LeftOverTy); |
| 3685 // operations and will not work with lowerLeftOvers as there is no valid | 3729 lowerSet(LeftOverTy, Offset); |
| 3686 // overlap. | |
| 3687 if (CountValue == 3) { | |
| 3688 lowerSet(IceType_i16, 0); | |
| 3689 lowerSet(IceType_i8, 2); | |
| 3690 return; | |
| 3691 } | |
| 3692 | |
| 3693 // TODO(ascull); 64-bit can do better with 64-bit mov | |
| 3694 for (uint32_t N = CountValue & 0xFFFFFFFC; N != 0;) { | |
| 3695 N -= 4; | |
| 3696 lowerSet(IceType_i32, N); | |
| 3697 } | |
| 3698 lowerLeftOvers(CountValue & 0x3); | |
| 3699 return; | 3730 return; |
| 3700 } | 3731 } |
| 3701 } | 3732 } |
| 3702 | 3733 |
| 3703 // Fall back on calling the memset function. The value operand needs to be | 3734 // Fall back on calling the memset function. The value operand needs to be |
| 3704 // extended to a stack slot size because the PNaCl ABI requires arguments to | 3735 // extended to a stack slot size because the PNaCl ABI requires arguments to |
| 3705 // be at least 32 bits wide. | 3736 // be at least 32 bits wide. |
| 3706 Operand *ValExt; | 3737 Operand *ValExt; |
| 3707 if (IsValConst) { | 3738 if (IsValConst) { |
| 3708 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); | 3739 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue); |
| (...skipping 1200 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 4909 // There aren't any 64-bit integer registers for x86-32. | 4940 // There aren't any 64-bit integer registers for x86-32. |
| 4910 assert(Type != IceType_i64); | 4941 assert(Type != IceType_i64); |
| 4911 Variable *Reg = Func->makeVariable(Type); | 4942 Variable *Reg = Func->makeVariable(Type); |
| 4912 if (RegNum == Variable::NoRegister) | 4943 if (RegNum == Variable::NoRegister) |
| 4913 Reg->setWeightInfinite(); | 4944 Reg->setWeightInfinite(); |
| 4914 else | 4945 else |
| 4915 Reg->setRegNum(RegNum); | 4946 Reg->setRegNum(RegNum); |
| 4916 return Reg; | 4947 return Reg; |
| 4917 } | 4948 } |
| 4918 | 4949 |
| 4950 template <class Machine> | |
| 4951 const Type TargetX86Base<Machine>::TypeForSize[] = { | |
| 4952 IceType_i8, IceType_i16, IceType_i32, | |
| 4953 (Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8}; | |
| 4954 template <class Machine> | |
| 4955 Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size, | |
| 4956 uint32_t MaxSize) { | |
| 4957 assert(Size != 0); | |
| 4958 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); | |
| 4959 uint32_t MaxIndex = MaxSize != NoSizeLimit | |
|
Jim Stichnoth
2015/08/20 16:57:04
I think it's generally easier to read if you minim
ascull
2015/08/20 18:41:56
Done.
| |
| 4960 ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined) | |
| 4961 : llvm::array_lengthof(TypeForSize) - 1; | |
| 4962 return TypeForSize[std::min(TyIndex, MaxIndex)]; | |
| 4963 } | |
| 4964 | |
| 4965 template <class Machine> | |
| 4966 Type TargetX86Base<Machine>::firstTypeThatFitsSize(uint32_t Size, | |
| 4967 uint32_t MaxSize) { | |
| 4968 assert(Size != 0); | |
| 4969 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined); | |
| 4970 if (!llvm::isPowerOf2_32(Size)) | |
| 4971 ++TyIndex; | |
| 4972 uint32_t MaxIndex = MaxSize != NoSizeLimit | |
| 4973 ? llvm::findLastSet(MaxSize, llvm::ZB_Undefined) | |
| 4974 : llvm::array_lengthof(TypeForSize) - 1; | |
| 4975 return TypeForSize[std::min(TyIndex, MaxIndex)]; | |
| 4976 } | |
| 4977 | |
| 4919 template <class Machine> void TargetX86Base<Machine>::postLower() { | 4978 template <class Machine> void TargetX86Base<Machine>::postLower() { |
| 4920 if (Ctx->getFlags().getOptLevel() == Opt_m1) | 4979 if (Ctx->getFlags().getOptLevel() == Opt_m1) |
| 4921 return; | 4980 return; |
| 4922 inferTwoAddress(); | 4981 inferTwoAddress(); |
| 4923 } | 4982 } |
| 4924 | 4983 |
| 4925 template <class Machine> | 4984 template <class Machine> |
| 4926 void TargetX86Base<Machine>::makeRandomRegisterPermutation( | 4985 void TargetX86Base<Machine>::makeRandomRegisterPermutation( |
| 4927 llvm::SmallVectorImpl<int32_t> &Permutation, | 4986 llvm::SmallVectorImpl<int32_t> &Permutation, |
| 4928 const llvm::SmallBitVector &ExcludeRegisters) const { | 4987 const llvm::SmallBitVector &ExcludeRegisters) const { |
| (...skipping 233 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5162 } | 5221 } |
| 5163 // the offset is not eligible for blinding or pooling, return the original | 5222 // the offset is not eligible for blinding or pooling, return the original |
| 5164 // mem operand | 5223 // mem operand |
| 5165 return MemOperand; | 5224 return MemOperand; |
| 5166 } | 5225 } |
| 5167 | 5226 |
| 5168 } // end of namespace X86Internal | 5227 } // end of namespace X86Internal |
| 5169 } // end of namespace Ice | 5228 } // end of namespace Ice |
| 5170 | 5229 |
| 5171 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H | 5230 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H |
| OLD | NEW |