| Index: src/IceTargetLoweringX86BaseImpl.h
|
| diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
|
| index 35d7ea047e9c70eaa2e069af2f51217b388ff6cc..79b54774b674ccefd84ed667f22dd324f2674e1b 100644
|
| --- a/src/IceTargetLoweringX86BaseImpl.h
|
| +++ b/src/IceTargetLoweringX86BaseImpl.h
|
| @@ -5675,6 +5675,99 @@ inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
|
| }
|
|
|
| template <typename TraitsType>
|
| +GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() { // Builds the GlobalString used to name a pshufb mask for the function being lowered.
|
| + GlobalString FuncName = Func->getFunctionName();
|
| + const SizeT Id = PshufbMaskCount++; // Post-increment: each call takes the next counter value.
|
| + if (!BuildDefs::dump() || !FuncName.hasStdString()) { // Dump support off, or the function name has no std::string form: use numeric IDs only.
|
| + return GlobalString::createWithString( // Name form: $PS<function name ID>_<Id>
|
| + Ctx,
|
| + "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
|
| + }
|
| + return GlobalString::createWithString( // Name form: Pshufb$<function name>$<Id>
|
| + Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
|
| +}
|
| +
|
| +template <typename TraitsType> // Creates a constant global holding the 16 given index bytes and returns a relocatable constant naming it.
|
| +ConstantRelocatable *
|
| +TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
|
| + int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
|
| + int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
|
| + int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
|
| + int8_t Idx15) {
|
| + static constexpr uint8_t NumElements = 16;
|
| + const char Initializer[NumElements] = { // Bytes are stored in argument order: Idx0 first, Idx15 last.
|
| + Idx0, Idx1, Idx2, Idx3, Idx4, Idx5, Idx6, Idx7,
|
| + Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
|
| + };
|
| +
|
| + static constexpr Type V4VectorType = IceType_v4i32;
|
| + const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType); // Alignment comes from the on-stack width of v4i32; presumably 16 bytes, matching the comment below -- TODO confirm.
|
| + auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
|
| + GlobalString MaskName = lowerShuffleVector_NewMaskName(); // Fresh name for every mask created.
|
| + Mask->setIsConstant(true);
|
| + Mask->addInitializer(VariableDeclaration::DataInitializer::create( // The initializer is built from the 16 bytes of Initializer.
|
| + Func->getGlobalPool(), Initializer, NumElements));
|
| + Mask->setName(MaskName);
|
| + // Mask needs to be 16-byte aligned, or pshufb will seg fault.
|
| + Mask->setAlignment(MaskAlignment);
|
| + Func->addGlobal(Mask); // Hand the finished declaration to the function being lowered.
|
| +
|
| + constexpr RelocOffsetT Offset = 0;
|
| + return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName)); // Symbol constant for MaskName at offset 0.
|
| +}
|
| +
|
| +template <typename TraitsType> // Lowers a two-source byte shuffle: pshufb on a copy of each source with its own mask, then por of the two results into Dest.
|
| +void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
|
| + Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
|
| + int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
|
| + int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
|
| + int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) { // IdxN selects result byte N: bit 4 picks Src0 (clear) or Src1 (set), the low 4 bits pick the byte within that source.
|
| + const Type DestTy = Dest->getType();
|
| + static constexpr bool NotRebased = false;
|
| + static constexpr Variable *NoBase = nullptr;
|
| + // We use void for the memory operand instead of DestTy because using the
|
| + // latter causes a validation failure: the X86 Inst layer complains that
|
| + // vector mem operands could be under aligned. Thus, using void we avoid the
|
| + // validation error. Note that the mask global declaration is aligned, so it
|
| + // can be used as an XMM mem operand.
|
| + static constexpr Type MaskType = IceType_void;
|
| +#define IDX_IN_SRC(N, S) \
|
| + ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
|
| + auto *Mask0M = X86OperandMem::create( // Src0 mask: indexes with bit 4 clear keep their low 4 bits; all others become CLEAR_ALL_BITS (defined outside this view; presumably makes pshufb write zero -- TODO confirm).
|
| + Func, MaskType, NoBase,
|
| + lowerShuffleVector_CreatePshufbMask(
|
| + IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
|
| + IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
|
| + IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
|
| + IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
|
| + IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
|
| + IDX_IN_SRC(Idx15, 0)),
|
| + NotRebased);
|
| + auto *Mask1M = X86OperandMem::create( // Src1 mask: same construction, but keeps the indexes whose bit 4 is set.
|
| + Func, MaskType, NoBase,
|
| + lowerShuffleVector_CreatePshufbMask(
|
| + IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
|
| + IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
|
| + IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
|
| + IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
|
| + IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
|
| + IDX_IN_SRC(Idx15, 1)),
|
| + NotRebased);
|
| +#undef IDX_IN_SRC
|
| + auto *T0 = makeReg(DestTy);
|
| + auto *T1 = makeReg(DestTy);
|
| + auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
|
| + _movp(T0, Src0RM); // T0 starts as a copy of Src0.
|
| + auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
|
| + _movp(T1, Src1RM); // T1 starts as a copy of Src1.
|
| +
|
| + _pshufb(T1, Mask1M); // Shuffle the Src1 copy with the Src1 mask.
|
| + _pshufb(T0, Mask0M); // Shuffle the Src0 copy with the Src0 mask.
|
| + _por(T1, T0); // Merge the two partial shuffles into T1.
|
| + _movp(Dest, T1);
|
| +}
|
| +
|
| +template <typename TraitsType>
|
| void TargetX86Base<TraitsType>::lowerShuffleVector(
|
| const InstShuffleVector *Instr) {
|
| auto *Dest = Instr->getDest();
|
| @@ -5687,9 +5780,68 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
|
|
|
| switch (DestTy) {
|
| default:
|
| - break;
|
| - // TODO(jpp): figure out how to properly lower the remaining cases without
|
| - // scalarization.
|
| + llvm::report_fatal_error("Unexpected vector type.");
|
| + case IceType_v16i1:
|
| + case IceType_v16i8: {
|
| + if (InstructionSet < Traits::SSE4_1) {
|
| + // TODO(jpp): figure out how to lower with sse2.
|
| + break;
|
| + }
|
| + static constexpr SizeT ExpectedNumElements = 16;
|
| + assert(ExpectedNumElements == Instr->getNumIndexes());
|
| + (void)ExpectedNumElements;
|
| + const SizeT Index0 = Instr->getIndex(0)->getValue();
|
| + const SizeT Index1 = Instr->getIndex(1)->getValue();
|
| + const SizeT Index2 = Instr->getIndex(2)->getValue();
|
| + const SizeT Index3 = Instr->getIndex(3)->getValue();
|
| + const SizeT Index4 = Instr->getIndex(4)->getValue();
|
| + const SizeT Index5 = Instr->getIndex(5)->getValue();
|
| + const SizeT Index6 = Instr->getIndex(6)->getValue();
|
| + const SizeT Index7 = Instr->getIndex(7)->getValue();
|
| + const SizeT Index8 = Instr->getIndex(8)->getValue();
|
| + const SizeT Index9 = Instr->getIndex(9)->getValue();
|
| + const SizeT Index10 = Instr->getIndex(10)->getValue();
|
| + const SizeT Index11 = Instr->getIndex(11)->getValue();
|
| + const SizeT Index12 = Instr->getIndex(12)->getValue();
|
| + const SizeT Index13 = Instr->getIndex(13)->getValue();
|
| + const SizeT Index14 = Instr->getIndex(14)->getValue();
|
| + const SizeT Index15 = Instr->getIndex(15)->getValue();
|
| + lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
|
| + Index3, Index4, Index5, Index6, Index7,
|
| + Index8, Index9, Index10, Index11, Index12,
|
| + Index13, Index14, Index15);
|
| + return;
|
| + }
|
| + case IceType_v8i1:
|
| + case IceType_v8i16: {
|
| + if (InstructionSet < Traits::SSE4_1) {
|
| + // TODO(jpp): figure out how to lower with sse2.
|
| + break;
|
| + }
|
| + static constexpr SizeT ExpectedNumElements = 8;
|
| + assert(ExpectedNumElements == Instr->getNumIndexes());
|
| + (void)ExpectedNumElements;
|
| + const SizeT Index0 = Instr->getIndex(0)->getValue();
|
| + const SizeT Index1 = Instr->getIndex(1)->getValue();
|
| + const SizeT Index2 = Instr->getIndex(2)->getValue();
|
| + const SizeT Index3 = Instr->getIndex(3)->getValue();
|
| + const SizeT Index4 = Instr->getIndex(4)->getValue();
|
| + const SizeT Index5 = Instr->getIndex(5)->getValue();
|
| + const SizeT Index6 = Instr->getIndex(6)->getValue();
|
| + const SizeT Index7 = Instr->getIndex(7)->getValue();
|
| +#define TO_BYTE_INDEX(I) ((I) << 1)
|
| + lowerShuffleVector_UsingPshufb(
|
| + Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
|
| + TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
|
| + TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
|
| + TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
|
| + TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
|
| + TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
|
| + TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
|
| + TO_BYTE_INDEX(Index7) + 1);
|
| +#undef TO_BYTE_INDEX
|
| + return;
|
| + }
|
| case IceType_v4i1:
|
| case IceType_v4i32:
|
| case IceType_v4f32: {
|
|
|