OLD | NEW |
(Empty) | |
| 1 //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// |
| 2 // |
| 3 // The LLVM Compiler Infrastructure |
| 4 // |
| 5 // This file is distributed under the University of Illinois Open Source |
| 6 // License. See LICENSE.TXT for details. |
| 7 // |
| 8 /// \file |
| 9 //==-----------------------------------------------------------------------===// |
| 10 |
| 11 #define DEBUG_TYPE "PeepholeOpt" |
| 12 #ifdef DEBUG |
| 13 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) |
| 14 #else |
| 15 #define DEBUGME 0 |
| 16 #endif |
| 17 |
| 18 #include "AMDILDevices.h" |
| 19 #include "AMDGPUInstrInfo.h" |
| 20 #include "llvm/ADT/Statistic.h" |
| 21 #include "llvm/ADT/StringExtras.h" |
| 22 #include "llvm/ADT/StringRef.h" |
| 23 #include "llvm/ADT/Twine.h" |
| 24 #include "llvm/IR/Constants.h" |
| 25 #include "llvm/CodeGen/MachineFunction.h" |
| 26 #include "llvm/CodeGen/MachineFunctionAnalysis.h" |
| 27 #include "llvm/IR/Function.h" |
| 28 #include "llvm/IR/Instructions.h" |
| 29 #include "llvm/IR/Module.h" |
| 30 #include "llvm/Support/Debug.h" |
| 31 #include "llvm/Support/MathExtras.h" |
| 32 |
| 33 #include <sstream> |
| 34 |
| 35 #if 0 |
| 36 STATISTIC(PointerAssignments, "Number of dynamic pointer " |
| 37 "assigments discovered"); |
| 38 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); |
| 39 #endif |
| 40 |
| 41 using namespace llvm; |
| 42 // The Peephole optimization pass is used to do simple last-minute optimizations |
| 43 // that are required for correct code or to remove redundant functions. |
| 44 namespace { |
| 45 |
| 46 class OpaqueType; |
| 47 |
| 48 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { |
| 49 public: |
| 50 TargetMachine &TM; |
| 51 static char ID; |
| 52 AMDGPUPeepholeOpt(TargetMachine &tm); |
| 53 ~AMDGPUPeepholeOpt(); |
| 54 const char *getPassName() const; |
| 55 bool runOnFunction(Function &F); |
| 56 bool doInitialization(Module &M); |
| 57 bool doFinalization(Module &M); |
| 58 void getAnalysisUsage(AnalysisUsage &AU) const; |
| 59 protected: |
| 60 private: |
| 61 // Function to initiate all of the instruction level optimizations. |
| 62 bool instLevelOptimizations(BasicBlock::iterator *inst); |
| 63 // Quick check to see if we need to dump all of the pointers into the |
| 64 // arena. If this is correct, then we set all pointers to exist in the arena. This |
| 65 // is a workaround for aliasing of pointers in a struct/union. |
| 66 bool dumpAllIntoArena(Function &F); |
| 67 // Because I don't want to invalidate any pointers while in the |
| 68 // safeNestedForEach function, I push atomic conversions to a vector and handle |
| 69 // them later. This function does the conversions if required. |
| 70 void doAtomicConversionIfNeeded(Function &F); |
| 71 // Because __amdil_is_constant cannot be properly evaluated if |
| 72 // optimizations are disabled, the calls are placed in a vector |
| 73 // and evaluated after the __amdil_image* functions are evaluated, |
| 74 // which should allow the __amdil_is_constant function to be |
| 75 // evaluated correctly. |
| 76 void doIsConstCallConversionIfNeeded(); |
| 77 bool mChanged; |
| 78 bool mDebug; |
| 79 bool mConvertAtomics; |
| 80 CodeGenOpt::Level optLevel; |
| 81 // Run a series of tests to see if we can optimize a CALL instruction. |
| 82 bool optimizeCallInst(BasicBlock::iterator *bbb); |
| 83 // A peephole optimization to optimize bit extract sequences. |
| 84 bool optimizeBitExtract(Instruction *inst); |
| 85 // A peephole optimization to optimize bit insert sequences. |
| 86 bool optimizeBitInsert(Instruction *inst); |
| 87 bool setupBitInsert(Instruction *base, |
| 88 Instruction *&src, |
| 89 Constant *&mask, |
| 90 Constant *&shift); |
| 91 // Expand the bit field insert instruction on versions of OpenCL that |
| 92 // don't support it. |
| 93 bool expandBFI(CallInst *CI); |
| 94 // Expand the bit field mask instruction on version of OpenCL that |
| 95 // don't support it. |
| 96 bool expandBFM(CallInst *CI); |
| 97 // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in |
| 98 // this case we need to expand them. These functions check for 24-bit functions |
| 99 // and then expand them. |
| 100 bool isSigned24BitOps(CallInst *CI); |
| 101 void expandSigned24BitOps(CallInst *CI); |
| 102 // One optimization that can occur is that if the required workgroup size is |
| 103 // specified then the result of get_local_size is known at compile time and |
| 104 // can be returned accordingly. |
| 105 bool isRWGLocalOpt(CallInst *CI); |
| 106 // On Northern Islands cards, the division is slightly less accurate than on |
| 107 // previous generations, so we need to utilize a more accurate division. On |
| 108 // all other cards we can translate the accurate divide to a normal divide. |
| 109 bool convertAccurateDivide(CallInst *CI); |
| 110 void expandAccurateDivide(CallInst *CI); |
| 111 // If the alignment is set incorrectly, it can produce really inefficient |
| 112 // code. This checks for this scenario and fixes it if possible. |
| 113 bool correctMisalignedMemOp(Instruction *inst); |
| 114 |
| 115 // If we are in no-opt mode, then we need to make sure that |
| 116 // local samplers are properly propagated, as constant propagation |
| 117 // doesn't occur and we need to know the value of kernel-defined |
| 118 // samplers at compile time. |
| 119 bool propagateSamplerInst(CallInst *CI); |
| 120 |
| 121 // Helper functions |
| 122 |
| 123 // Group of functions that recursively calculate the size of a structure based |
| 124 // on its sub-types. |
| 125 size_t getTypeSize(Type * const T, bool dereferencePtr = false); |
| 126 size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); |
| 127 size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); |
| 128 size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false); |
| 129 size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); |
| 130 size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); |
| 131 size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); |
| 132 size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); |
| 133 |
| 134 LLVMContext *mCTX; |
| 135 Function *mF; |
| 136 const AMDGPUSubtarget *mSTM; |
| 137 SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; |
| 138 SmallVector<CallInst *, 16> isConstVec; |
| 139 }; // class AMDGPUPeepholeOpt |
| 140 char AMDGPUPeepholeOpt::ID = 0; |
| 141 |
| 142 // A template function that has two levels of looping before calling the |
| 143 // function with a pointer to the current iterator. |
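| // Note: the functor is expected to return true only when it has already |
| // advanced (or erased past) the iterator itself; when it returns false, the |
| // loop advances sf, so each element is visited exactly once. |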
| 144 template<class InputIterator, class SecondIterator, class Function> |
| 145 Function safeNestedForEach(InputIterator First, InputIterator Last, |
| 146 SecondIterator S, Function F) { |
| 147 for ( ; First != Last; ++First) { |
| 148 SecondIterator sf, sl; |
| 149 for (sf = First->begin(), sl = First->end(); |
| 150 sf != sl; ) { |
| 151 if (!F(&sf)) { |
| 152 ++sf; |
| 153 } |
| 154 } |
| 155 } |
| 156 return F; |
| 157 } |
| 158 |
| 159 } // anonymous namespace |
| 160 |
| 161 namespace llvm { |
| 162 FunctionPass * |
| 163 createAMDGPUPeepholeOpt(TargetMachine &tm) { |
| 164 return new AMDGPUPeepholeOpt(tm); |
| 165 } |
| 166 } // llvm namespace |
| 167 |
| 168 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) |
| 169 : FunctionPass(ID), TM(tm) { |
| 170 mDebug = DEBUGME; |
| 171 optLevel = TM.getOptLevel(); |
| 172 |
| 173 } |
| 174 |
| 175 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { |
| 176 } |
| 177 |
| 178 const char * |
| 179 AMDGPUPeepholeOpt::getPassName() const { |
| 180 return "AMDGPU PeepHole Optimization Pass"; |
| 181 } |
| 182 |
| 183 bool |
| 184 containsPointerType(Type *Ty) { |
| 185 if (!Ty) { |
| 186 return false; |
| 187 } |
| 188 switch(Ty->getTypeID()) { |
| 189 default: |
| 190 return false; |
| 191 case Type::StructTyID: { |
| 192 const StructType *ST = dyn_cast<StructType>(Ty); |
| 193 for (StructType::element_iterator stb = ST->element_begin(), |
| 194 ste = ST->element_end(); stb != ste; ++stb) { |
| 195 if (!containsPointerType(*stb)) { |
| 196 continue; |
| 197 } |
| 198 return true; |
| 199 } |
| 200 break; |
| 201 } |
| 202 case Type::VectorTyID: |
| 203 case Type::ArrayTyID: |
| 204 return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); |
| 205 case Type::PointerTyID: |
| 206 return true; |
| 207 }; |
| 208 return false; |
| 209 } |
| 210 |
| 211 bool |
| 212 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { |
| 213 bool dumpAll = false; |
| 214 for (Function::const_arg_iterator cab = F.arg_begin(), |
| 215 cae = F.arg_end(); cab != cae; ++cab) { |
| 216 const Argument *arg = cab; |
| 217 const PointerType *PT = dyn_cast<PointerType>(arg->getType()); |
| 218 if (!PT) { |
| 219 continue; |
| 220 } |
| 221 Type *DereferencedType = PT->getElementType(); |
| 222 if (!dyn_cast<StructType>(DereferencedType) |
| 223 ) { |
| 224 continue; |
| 225 } |
| 226 if (!containsPointerType(DereferencedType)) { |
| 227 continue; |
| 228 } |
| 229 // FIXME: Because a pointer inside of a struct/union may be aliased to |
| 230 // another pointer we need to take the conservative approach and place all |
| 231 // pointers into the arena until more advanced detection is implemented. |
| 232 dumpAll = true; |
| 233 } |
| 234 return dumpAll; |
| 235 } |
| 236 void |
| 237 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { |
| 238 if (isConstVec.empty()) { |
| 239 return; |
| 240 } |
| 241 for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { |
| 242 CallInst *CI = isConstVec[x]; |
| 243 Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
| 244 Type *aType = Type::getInt32Ty(*mCTX); |
| 245 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
| 246 : ConstantInt::get(aType, 0); |
| 247 CI->replaceAllUsesWith(Val); |
| 248 CI->eraseFromParent(); |
| 249 } |
| 250 isConstVec.clear(); |
| 251 } |
| 252 void |
| 253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { |
| 254 // Don't do anything if we don't have any atomic operations. |
| 255 if (atomicFuncs.empty()) { |
| 256 return; |
| 257 } |
| 258 // Change the function name for the atomic if it is required |
| 259 uint32_t size = atomicFuncs.size(); |
| 260 for (uint32_t x = 0; x < size; ++x) { |
| 261 atomicFuncs[x].first->setOperand( |
| 262 atomicFuncs[x].first->getNumOperands()-1, |
| 263 atomicFuncs[x].second); |
| 264 |
| 265 } |
| 266 mChanged = true; |
| 267 if (mConvertAtomics) { |
| 268 return; |
| 269 } |
| 270 } |
| 271 |
| 272 bool |
| 273 AMDGPUPeepholeOpt::runOnFunction(Function &MF) { |
| 274 mChanged = false; |
| 275 mF = &MF; |
| 276 mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); |
| 277 if (mDebug) { |
| 278 MF.dump(); |
| 279 } |
| 280 mCTX = &MF.getType()->getContext(); |
| 281 mConvertAtomics = true; |
| 282 safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), |
| 283 std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), |
| 284 this)); |
| 285 |
| 286 doAtomicConversionIfNeeded(MF); |
| 287 doIsConstCallConversionIfNeeded(); |
| 288 |
| 289 if (mDebug) { |
| 290 MF.dump(); |
| 291 } |
| 292 return mChanged; |
| 293 } |
| 294 |
| 295 bool |
| 296 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { |
| 297 Instruction *inst = (*bbb); |
| 298 CallInst *CI = dyn_cast<CallInst>(inst); |
| 299 if (!CI) { |
| 300 return false; |
| 301 } |
| 302 if (isSigned24BitOps(CI)) { |
| 303 expandSigned24BitOps(CI); |
| 304 ++(*bbb); |
| 305 CI->eraseFromParent(); |
| 306 return true; |
| 307 } |
| 308 if (propagateSamplerInst(CI)) { |
| 309 return false; |
| 310 } |
| 311 if (expandBFI(CI) || expandBFM(CI)) { |
| 312 ++(*bbb); |
| 313 CI->eraseFromParent(); |
| 314 return true; |
| 315 } |
| 316 if (convertAccurateDivide(CI)) { |
| 317 expandAccurateDivide(CI); |
| 318 ++(*bbb); |
| 319 CI->eraseFromParent(); |
| 320 return true; |
| 321 } |
| 322 |
| 323 StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); |
| 324 if (calleeName.startswith("__amdil_is_constant")) { |
| 325 // If we do not have optimizations, then this |
| 326 // cannot be properly evaluated, so we add the |
| 327 // call instruction to a vector and process |
| 328 // them at the end of processing after the |
| 329 // samplers have been correctly handled. |
| 330 if (optLevel == CodeGenOpt::None) { |
| 331 isConstVec.push_back(CI); |
| 332 return false; |
| 333 } else { |
| 334 Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); |
| 335 Type *aType = Type::getInt32Ty(*mCTX); |
| 336 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) |
| 337 : ConstantInt::get(aType, 0); |
| 338 CI->replaceAllUsesWith(Val); |
| 339 ++(*bbb); |
| 340 CI->eraseFromParent(); |
| 341 return true; |
| 342 } |
| 343 } |
| 344 |
| 345 if (calleeName.equals("__amdil_is_asic_id_i32")) { |
| 346 ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); |
| 347 Type *aType = Type::getInt32Ty(*mCTX); |
| 348 Value *Val = CV; |
| 349 if (Val) { |
| 350 Val = ConstantInt::get(aType, |
| 351 mSTM->device()->getDeviceFlag() & CV->getZExtValue()); |
| 352 } else { |
| 353 Val = ConstantInt::get(aType, 0); |
| 354 } |
| 355 CI->replaceAllUsesWith(Val); |
| 356 ++(*bbb); |
| 357 CI->eraseFromParent(); |
| 358 return true; |
| 359 } |
| 360 Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); |
| 361 if (!F) { |
| 362 return false; |
| 363 } |
| 364 if (F->getName().startswith("__atom") && !CI->getNumUses() |
| 365 && F->getName().find("_xchg") == StringRef::npos) { |
| 366 std::string buffer(F->getName().str() + "_noret"); |
| 367 F = dyn_cast<Function>( |
| 368 F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); |
| 369 atomicFuncs.push_back(std::make_pair(CI, F)); |
| 370 } |
| 371 |
| 372 if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) |
| 373 && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { |
| 374 return false; |
| 375 } |
| 376 if (!mConvertAtomics) { |
| 377 return false; |
| 378 } |
| 379 StringRef name = F->getName(); |
| 380 if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { |
| 381 mConvertAtomics = false; |
| 382 } |
| 383 return false; |
| 384 } |
| 385 |
| 386 bool |
| 387 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, |
| 388 Instruction *&src, |
| 389 Constant *&mask, |
| 390 Constant *&shift) { |
| 391 if (!base) { |
| 392 if (mDebug) { |
| 393 dbgs() << "Null pointer passed into function.\n"; |
| 394 } |
| 395 return false; |
| 396 } |
| 397 bool andOp = false; |
| 398 if (base->getOpcode() == Instruction::Shl) { |
| 399 shift = dyn_cast<Constant>(base->getOperand(1)); |
| 400 } else if (base->getOpcode() == Instruction::And) { |
| 401 mask = dyn_cast<Constant>(base->getOperand(1)); |
| 402 andOp = true; |
| 403 } else { |
| 404 if (mDebug) { |
| 405 dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; |
| 406 } |
| 407 // If the base is neither a Shl nor an And, we don't fit any of the patterns above. |
| 408 return false; |
| 409 } |
| 410 src = dyn_cast<Instruction>(base->getOperand(0)); |
| 411 if (!src) { |
| 412 if (mDebug) { |
| 413 dbgs() << "Failed setup since the base operand is not an instruction!\n"; |
| 414 } |
| 415 return false; |
| 416 } |
| 417 // If we find an 'and' operation, then we don't need to |
| 418 // find the next operation as we already know the |
| 419 // bits that are valid at this point. |
| 420 if (andOp) { |
| 421 return true; |
| 422 } |
| 423 if (src->getOpcode() == Instruction::Shl && !shift) { |
| 424 shift = dyn_cast<Constant>(src->getOperand(1)); |
| 425 src = dyn_cast<Instruction>(src->getOperand(0)); |
| 426 } else if (src->getOpcode() == Instruction::And && !mask) { |
| 427 mask = dyn_cast<Constant>(src->getOperand(1)); |
| 428 } |
| 429 if (!mask && !shift) { |
| 430 if (mDebug) { |
| 431 dbgs() << "Failed setup since both mask and shift are NULL!\n"; |
| 432 } |
| 433 // Did not find a constant mask or a shift. |
| 434 return false; |
| 435 } |
| 436 return true; |
| 437 } |
| 438 bool |
| 439 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { |
| 440 if (!inst) { |
| 441 return false; |
| 442 } |
| 443 if (!inst->isBinaryOp()) { |
| 444 return false; |
| 445 } |
| 446 if (inst->getOpcode() != Instruction::Or) { |
| 447 return false; |
| 448 } |
| 449 if (optLevel == CodeGenOpt::None) { |
| 450 return false; |
| 451 } |
| 452 // We want to do an optimization on a sequence of ops that in the end equals a |
| 453 // single ISA instruction. |
| 454 // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) |
| 455 // Some simplified versions of this pattern are as follows: |
| 456 // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 |
| 457 // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E |
| 458 // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B |
| 459 // (A & B) | (D << F) when (1 << F) >= B |
| 460 // (A << C) | (D & E) when (1 << C) >= E |
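| // As a concrete sketch of the first simplified form: (A & 0xFF00) | (D & 0xFF) |
| // has disjoint masks, so it can collapse into a single ubit_insert with |
| // width = 8 and offset = 8 (the operands are assembled further below). |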
| 461 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
| 462 // The HD4XXX hardware doesn't support the ubit_insert instruction. |
| 463 return false; |
| 464 } |
| 465 Type *aType = inst->getType(); |
| 466 bool isVector = aType->isVectorTy(); |
| 467 int numEle = 1; |
| 468 // This optimization only works on 32bit integers. |
| 469 if (aType->getScalarType() |
| 470 != Type::getInt32Ty(inst->getContext())) { |
| 471 return false; |
| 472 } |
| 473 if (isVector) { |
| 474 const VectorType *VT = dyn_cast<VectorType>(aType); |
| 475 numEle = VT->getNumElements(); |
| 476 // We currently cannot support more than 4 elements in an intrinsic and we |
| 477 // cannot support Vec3 types. |
| 478 if (numEle > 4 || numEle == 3) { |
| 479 return false; |
| 480 } |
| 481 } |
| 482 // TODO: Handle vectors. |
| 483 if (isVector) { |
| 484 if (mDebug) { |
| 485 dbgs() << "!!! Vectors are not supported yet!\n"; |
| 486 } |
| 487 return false; |
| 488 } |
| 489 Instruction *LHSSrc = NULL, *RHSSrc = NULL; |
| 490 Constant *LHSMask = NULL, *RHSMask = NULL; |
| 491 Constant *LHSShift = NULL, *RHSShift = NULL; |
| 492 Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); |
| 493 Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); |
| 494 if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { |
| 495 if (mDebug) { |
| 496 dbgs() << "Found an OR Operation that failed setup!\n"; |
| 497 inst->dump(); |
| 498 if (LHS) { LHS->dump(); } |
| 499 if (LHSSrc) { LHSSrc->dump(); } |
| 500 if (LHSMask) { LHSMask->dump(); } |
| 501 if (LHSShift) { LHSShift->dump(); } |
| 502 } |
| 503 // There was an issue with the setup for BitInsert. |
| 504 return false; |
| 505 } |
| 506 if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { |
| 507 if (mDebug) { |
| 508 dbgs() << "Found an OR Operation that failed setup!\n"; |
| 509 inst->dump(); |
| 510 if (RHS) { RHS->dump(); } |
| 511 if (RHSSrc) { RHSSrc->dump(); } |
| 512 if (RHSMask) { RHSMask->dump(); } |
| 513 if (RHSShift) { RHSShift->dump(); } |
| 514 } |
| 515 // There was an issue with the setup for BitInsert. |
| 516 return false; |
| 517 } |
| 518 if (mDebug) { |
| 519 dbgs() << "Found an OR operation that can possible be optimized to ubit inse
rt!\n"; |
| 520 dbgs() << "Op: "; inst->dump(); |
| 521 dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\
n"; } |
| 522 dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(
None)\n"; } |
| 523 dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() <<
"(None)\n"; } |
| 524 dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() <
< "(None)\n"; } |
| 525 dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\
n"; } |
| 526 dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(
None)\n"; } |
| 527 dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() <<
"(None)\n"; } |
| 528 dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() <
< "(None)\n"; } |
| 529 } |
| 530 Constant *offset = NULL; |
| 531 Constant *width = NULL; |
| 532 uint32_t lhsMaskVal = 0, rhsMaskVal = 0; |
| 533 uint32_t lhsShiftVal = 0, rhsShiftVal = 0; |
| 534 uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; |
| 535 uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; |
| 536 lhsMaskVal = (LHSMask |
| 537 ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); |
| 538 rhsMaskVal = (RHSMask |
| 539 ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); |
| 540 lhsShiftVal = (LHSShift |
| 541 ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); |
| 542 rhsShiftVal = (RHSShift |
| 543 ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); |
| 544 lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; |
| 545 rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; |
| 546 lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; |
| 547 rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; |
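| // e.g. a mask of 0x0FF0 yields width = 8 (population count) and offset = 4 |
| // (trailing zeros); with no mask, the field is assumed to run from the shift |
| // amount up to bit 31. |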
| 548 // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks). |
| 549 if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { |
| 550 return false; |
| 551 } |
| 552 if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { |
| 553 offset = ConstantInt::get(aType, lhsMaskOffset, false); |
| 554 width = ConstantInt::get(aType, lhsMaskWidth, false); |
| 555 RHSSrc = RHS; |
| 556 if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { |
| 557 return false; |
| 558 } |
| 559 if (!LHSShift) { |
| 560 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 561 "MaskShr", LHS); |
| 562 } else if (lhsShiftVal != lhsMaskOffset) { |
| 563 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 564 "MaskShr", LHS); |
| 565 } |
| 566 if (mDebug) { |
| 567 dbgs() << "Optimizing LHS!\n"; |
| 568 } |
| 569 } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { |
| 570 offset = ConstantInt::get(aType, rhsMaskOffset, false); |
| 571 width = ConstantInt::get(aType, rhsMaskWidth, false); |
| 572 LHSSrc = RHSSrc; |
| 573 RHSSrc = LHS; |
| 574 if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { |
| 575 return false; |
| 576 } |
| 577 if (!RHSShift) { |
| 578 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 579 "MaskShr", RHS); |
| 580 } else if (rhsShiftVal != rhsMaskOffset) { |
| 581 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, |
| 582 "MaskShr", RHS); |
| 583 } |
| 584 if (mDebug) { |
| 585 dbgs() << "Optimizing RHS!\n"; |
| 586 } |
| 587 } else { |
| 588 if (mDebug) { |
| 589 dbgs() << "Failed constraint 3!\n"; |
| 590 } |
| 591 return false; |
| 592 } |
| 593 if (mDebug) { |
| 594 dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"
; } |
| 595 dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\
n"; } |
| 596 dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\
n"; } |
| 597 dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\
n"; } |
| 598 } |
| 599 if (!offset || !width) { |
| 600 if (mDebug) { |
| 601 dbgs() << "Either width or offset are NULL, failed detection!\n"; |
| 602 } |
| 603 return false; |
| 604 } |
| 605 // Let's create the function signature. |
| 606 std::vector<Type *> callTypes; |
| 607 callTypes.push_back(aType); |
| 608 callTypes.push_back(aType); |
| 609 callTypes.push_back(aType); |
| 610 callTypes.push_back(aType); |
| 611 FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
| 612 std::string name = "__amdil_ubit_insert"; |
| 613 if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } |
| 614 Function *Func = |
| 615 dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
| 616 getOrInsertFunction(StringRef(name), funcType)); |
| 617 Value *Operands[4] = { |
| 618 width, |
| 619 offset, |
| 620 LHSSrc, |
| 621 RHSSrc |
| 622 }; |
| 623 CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); |
| 624 if (mDebug) { |
| 625 dbgs() << "Old Inst: "; |
| 626 inst->dump(); |
| 627 dbgs() << "New Inst: "; |
| 628 CI->dump(); |
| 629 dbgs() << "\n\n"; |
| 630 } |
| 631 CI->insertBefore(inst); |
| 632 inst->replaceAllUsesWith(CI); |
| 633 return true; |
| 634 } |
| 635 |
| 636 bool |
| 637 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { |
| 638 if (!inst) { |
| 639 return false; |
| 640 } |
| 641 if (!inst->isBinaryOp()) { |
| 642 return false; |
| 643 } |
| 644 if (inst->getOpcode() != Instruction::And) { |
| 645 return false; |
| 646 } |
| 647 if (optLevel == CodeGenOpt::None) { |
| 648 return false; |
| 649 } |
| 650 // We want to do some simple optimizations on Shift right/And patterns. The |
| 651 // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a |
| 652 // value smaller than 32 and C is a mask. If C is a constant value, then the |
| 653 // following transformation can occur. For signed integers, it turns into the |
| 654 // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned |
| 655 // integers, it turns into the function call dst = |
| 656 // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract |
| 657 // can be found in Section 7.9 of the ATI IL spec of the stream SDK for |
| 658 // Evergreen hardware. |
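| // For example, (A >> 8) & 0xFF extracts the 8-bit field that starts at bit 8 |
| // of A, so the LShr/And pair becomes one bit-extract call with a width of 8 |
| // and a shift of 8. |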
| 659 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { |
| 660 // This does not work on HD4XXX hardware. |
| 661 return false; |
| 662 } |
| 663 Type *aType = inst->getType(); |
| 664 bool isVector = aType->isVectorTy(); |
| 665 |
| 666 // XXX Support vector types |
| 667 if (isVector) { |
| 668 return false; |
| 669 } |
| 670 int numEle = 1; |
| 671 // This only works on 32bit integers |
| 672 if (aType->getScalarType() |
| 673 != Type::getInt32Ty(inst->getContext())) { |
| 674 return false; |
| 675 } |
| 676 if (isVector) { |
| 677 const VectorType *VT = dyn_cast<VectorType>(aType); |
| 678 numEle = VT->getNumElements(); |
| 679 // We currently cannot support more than 4 elements in an intrinsic and we |
| 680 // cannot support Vec3 types. |
| 681 if (numEle > 4 || numEle == 3) { |
| 682 return false; |
| 683 } |
| 684 } |
| 685 BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); |
| 686 // If the first operand is not a shift instruction, then we can return as it |
| 687 // doesn't match this pattern. |
| 688 if (!ShiftInst || !ShiftInst->isShift()) { |
| 689 return false; |
| 690 } |
| 691 // If we are a shift left, then we don't match this pattern. |
| 692 if (ShiftInst->getOpcode() == Instruction::Shl) { |
| 693 return false; |
| 694 } |
| 695 bool isSigned = ShiftInst->isArithmeticShift(); |
| 696 Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); |
| 697 Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); |
| 698 // Let's make sure that the shift value and the And mask are constant integers. |
| 699 if (!AndMask || !ShrVal) { |
| 700 return false; |
| 701 } |
| 702 Constant *newMaskConst; |
| 703 Constant *shiftValConst; |
| 704 if (isVector) { |
| 705 // Handle the vector case |
| 706 std::vector<Constant *> maskVals; |
| 707 std::vector<Constant *> shiftVals; |
| 708 ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); |
| 709 ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); |
| 710 Type *scalarType = AndMaskVec->getType()->getScalarType(); |
| 711 assert(AndMaskVec->getNumOperands() == |
| 712 ShrValVec->getNumOperands() && "cannot have a " |
| 713 "combination where the number of elements to a " |
| 714 "shift and an and are different!"); |
| 715 for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { |
| 716 ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); |
| 717 ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); |
| 718 if (!AndCI || !ShiftIC) { |
| 719 return false; |
| 720 } |
| 721 uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); |
| 722 if (!isMask_32(maskVal)) { |
| 723 return false; |
| 724 } |
| 725 maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
| 726 uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); |
| 727 // If the mask or shiftval is greater than the bitcount, then break out. |
| 728 if (maskVal >= 32 || shiftVal >= 32) { |
| 729 return false; |
| 730 } |
| 731 // If the mask val is greater than the number of original bits left |
| 732 // then this optimization is invalid. |
| 733 if (maskVal > (32 - shiftVal)) { |
| 734 return false; |
| 735 } |
| 736 maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); |
| 737 shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); |
| 738 } |
| 739 newMaskConst = ConstantVector::get(maskVals); |
| 740 shiftValConst = ConstantVector::get(shiftVals); |
| 741 } else { |
| 742 // Handle the scalar case |
| 743 uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); |
| 744 // This must be a mask value where all lower bits are set to 1 and then any |
| 745 // bit higher is set to 0. |
| 746 if (!isMask_32(maskVal)) { |
| 747 return false; |
| 748 } |
| 749 maskVal = (uint32_t)CountTrailingOnes_32(maskVal); |
| 750 // Count the number of bits set in the mask; this is the width of the |
| 751 // resulting bit set that is extracted from the source value. |
| 752 uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); |
| 753 // If the mask or shift val is greater than the bitcount, then break out. |
| 754 if (maskVal >= 32 || shiftVal >= 32) { |
| 755 return false; |
| 756 } |
| 757 // If the mask val is greater than the number of original bits left then |
| 758 // this optimization is invalid. |
| 759 if (maskVal > (32 - shiftVal)) { |
| 760 return false; |
| 761 } |
| 762 newMaskConst = ConstantInt::get(aType, maskVal, isSigned); |
| 763 shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); |
| 764 } |
| 765 // Let's create the function signature. |
| 766 std::vector<Type *> callTypes; |
| 767 callTypes.push_back(aType); |
| 768 callTypes.push_back(aType); |
| 769 callTypes.push_back(aType); |
| 770 FunctionType *funcType = FunctionType::get(aType, callTypes, false); |
| 771 std::string name = "llvm.AMDGPU.bit.extract.u32"; |
| 772 if (isVector) { |
| 773 name += ".v" + itostr(numEle) + "i32"; |
| 774 } else { |
| 775 name += "."; |
| 776 } |
| 777 // Let's create the function. |
| 778 Function *Func = |
| 779 dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> |
| 780 getOrInsertFunction(StringRef(name), funcType)); |
| 781 Value *Operands[3] = { |
| 782 ShiftInst->getOperand(0), |
| 783 shiftValConst, |
| 784 newMaskConst |
| 785 }; |
| 786 // Let's create the Call with the operands. |
| 787 CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); |
| 788 CI->setDoesNotAccessMemory(); |
| 789 CI->insertBefore(inst); |
| 790 inst->replaceAllUsesWith(CI); |
| 791 return true; |
| 792 } |
| 793 |
| 794 bool |
| 795 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { |
| 796 if (!CI) { |
| 797 return false; |
| 798 } |
| 799 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 800 if (!LHS->getName().startswith("__amdil_bfi")) { |
| 801 return false; |
| 802 } |
| 803 Type* type = CI->getOperand(0)->getType(); |
| 804 Constant *negOneConst = NULL; |
| 805 if (type->isVectorTy()) { |
| 806 std::vector<Constant *> negOneVals; |
| 807 negOneConst = ConstantInt::get(CI->getContext(), |
| 808 APInt(32, StringRef("-1"), 10)); |
| 809 for (size_t x = 0, |
| 810 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
| 811 negOneVals.push_back(negOneConst); |
| 812 } |
| 813 negOneConst = ConstantVector::get(negOneVals); |
| 814 } else { |
| 815 negOneConst = ConstantInt::get(CI->getContext(), |
| 816 APInt(32, StringRef("-1"), 10)); |
| 817 } |
| 818 // __amdil_bfi => (A & B) | (~A & C) |
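| // In other words, each result bit is taken from B where the corresponding bit |
| // of A is set and from C where it is clear (a per-bit select keyed by A). |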
| 819 BinaryOperator *lhs = |
| 820 BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
| 821 CI->getOperand(1), "bfi_and", CI); |
| 822 BinaryOperator *rhs = |
| 823 BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, |
| 824 "bfi_not", CI); |
| 825 rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), |
| 826 "bfi_and", CI); |
| 827 lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); |
| 828 CI->replaceAllUsesWith(lhs); |
| 829 return true; |
| 830 } |
| 831 |
| 832 bool |
| 833 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { |
| 834 if (!CI) { |
| 835 return false; |
| 836 } |
| 837 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 838 if (!LHS->getName().startswith("__amdil_bfm")) { |
| 839 return false; |
| 840 } |
| 841 // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) |
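| // For example, src0 == 4 and src1 == 8 give ((1 << 4) - 1) << 8 == 0xF00, |
| // i.e. a 4-bit mask placed at bit offset 8. |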
| 842 Constant *newMaskConst = NULL; |
| 843 Constant *newShiftConst = NULL; |
| 844 Type* type = CI->getOperand(0)->getType(); |
| 845 if (type->isVectorTy()) { |
| 846 std::vector<Constant*> newMaskVals, newShiftVals; |
| 847 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
| 848 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
| 849 for (size_t x = 0, |
| 850 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { |
| 851 newMaskVals.push_back(newMaskConst); |
| 852 newShiftVals.push_back(newShiftConst); |
| 853 } |
| 854 newMaskConst = ConstantVector::get(newMaskVals); |
| 855 newShiftConst = ConstantVector::get(newShiftVals); |
| 856 } else { |
| 857 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); |
| 858 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); |
| 859 } |
| 860 BinaryOperator *lhs = |
| 861 BinaryOperator::Create(Instruction::And, CI->getOperand(0), |
| 862 newMaskConst, "bfm_mask", CI); |
| 863 lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, |
| 864 lhs, "bfm_shl", CI); |
| 865 lhs = BinaryOperator::Create(Instruction::Sub, lhs, |
| 866 newShiftConst, "bfm_sub", CI); |
| 867 BinaryOperator *rhs = |
| 868 BinaryOperator::Create(Instruction::And, CI->getOperand(1), |
| 869 newMaskConst, "bfm_mask", CI); |
| 870 lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); |
| 871 CI->replaceAllUsesWith(lhs); |
| 872 return true; |
| 873 } |
| 874 |
| 875 bool |
| 876 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { |
| 877 Instruction *inst = (*bbb); |
| 878 if (optimizeCallInst(bbb)) { |
| 879 return true; |
| 880 } |
| 881 if (optimizeBitExtract(inst)) { |
| 882 return false; |
| 883 } |
| 884 if (optimizeBitInsert(inst)) { |
| 885 return false; |
| 886 } |
| 887 if (correctMisalignedMemOp(inst)) { |
| 888 return false; |
| 889 } |
| 890 return false; |
| 891 } |
| 892 bool |
| 893 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { |
| 894 LoadInst *linst = dyn_cast<LoadInst>(inst); |
| 895 StoreInst *sinst = dyn_cast<StoreInst>(inst); |
| 896 unsigned alignment; |
| 897 Type* Ty = inst->getType(); |
| 898 if (linst) { |
| 899 alignment = linst->getAlignment(); |
| 900 Ty = inst->getType(); |
| 901 } else if (sinst) { |
| 902 alignment = sinst->getAlignment(); |
| 903 Ty = sinst->getValueOperand()->getType(); |
| 904 } else { |
| 905 return false; |
| 906 } |
| 907 unsigned size = getTypeSize(Ty); |
| 908 if (size == alignment || size < alignment) { |
| 909 return false; |
| 910 } |
| 911 if (!Ty->isStructTy()) { |
| 912 return false; |
| 913 } |
| 914 if (alignment < 4) { |
| 915 if (linst) { |
| 916 linst->setAlignment(0); |
| 917 return true; |
| 918 } else if (sinst) { |
| 919 sinst->setAlignment(0); |
| 920 return true; |
| 921 } |
| 922 } |
| 923 return false; |
| 924 } |
| 925 bool |
| 926 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { |
| 927 if (!CI) { |
| 928 return false; |
| 929 } |
| 930 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); |
| 931 std::string namePrefix = LHS->getName().substr(0, 14); |
| 932 if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" |
| 933 && namePrefix != "__amdil__imul24_high") { |
| 934 return false; |
| 935 } |
| 936 if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { |
| 937 return false; |
| 938 } |
| 939 return true; |
| 940 } |
| 941 |
| 942 void |
| 943 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { |
| 944 assert(isSigned24BitOps(CI) && "Must be a " |
| 945 "signed 24 bit operation to call this function!"); |
| 946 Value *LHS = CI->getOperand(CI->getNumOperands()-1); |
| 947 // On 7XX and 8XX we do not have signed 24bit, so we need to |
| 948 // expand it to the following: |
| 949 // imul24 turns into 32bit imul |
| 950 // imad24 turns into 32bit imad |
| 951 // imul24_high turns into 32bit imulhigh |
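| // For example, a call to __amdil_imad24(a, b, c) is re-emitted below as a call |
| // to the 32-bit __amdil_imad_i32(a, b, c), while __amdil_imul24(a, b) is |
| // lowered to a plain 32-bit mul instruction. |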
| 952 if (LHS->getName().substr(0, 14) == "__amdil_imad24") { |
| 953 Type *aType = CI->getOperand(0)->getType(); |
| 954 bool isVector = aType->isVectorTy(); |
| 955 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
| 956 std::vector<Type*> callTypes; |
| 957 callTypes.push_back(CI->getOperand(0)->getType()); |
| 958 callTypes.push_back(CI->getOperand(1)->getType()); |
| 959 callTypes.push_back(CI->getOperand(2)->getType()); |
| 960 FunctionType *funcType = |
| 961 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
| 962 std::string name = "__amdil_imad"; |
| 963 if (isVector) { |
| 964 name += "_v" + itostr(numEle) + "i32"; |
| 965 } else { |
| 966 name += "_i32"; |
| 967 } |
| 968 Function *Func = dyn_cast<Function>( |
| 969 CI->getParent()->getParent()->getParent()-> |
| 970 getOrInsertFunction(StringRef(name), funcType)); |
| 971 Value *Operands[3] = { |
| 972 CI->getOperand(0), |
| 973 CI->getOperand(1), |
| 974 CI->getOperand(2) |
| 975 }; |
| 976 CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); |
| 977 nCI->insertBefore(CI); |
| 978 CI->replaceAllUsesWith(nCI); |
| 979 } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { |
| 980 BinaryOperator *mulOp = |
| 981 BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), |
| 982 CI->getOperand(1), "imul24", CI); |
| 983 CI->replaceAllUsesWith(mulOp); |
| 984 } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { |
| 985 Type *aType = CI->getOperand(0)->getType(); |
| 986 |
| 987 bool isVector = aType->isVectorTy(); |
| 988 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; |
| 989 std::vector<Type*> callTypes; |
| 990 callTypes.push_back(CI->getOperand(0)->getType()); |
| 991 callTypes.push_back(CI->getOperand(1)->getType()); |
| 992 FunctionType *funcType = |
| 993 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); |
| 994 std::string name = "__amdil_imul_high"; |
| 995 if (isVector) { |
| 996 name += "_v" + itostr(numEle) + "i32"; |
| 997 } else { |
| 998 name += "_i32"; |
| 999 } |
| 1000 Function *Func = dyn_cast<Function>( |
| 1001 CI->getParent()->getParent()->getParent()-> |
| 1002 getOrInsertFunction(StringRef(name), funcType)); |
| 1003 Value *Operands[2] = { |
| 1004 CI->getOperand(0), |
| 1005 CI->getOperand(1) |
| 1006 }; |
| 1007 CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); |
| 1008 nCI->insertBefore(CI); |
| 1009 CI->replaceAllUsesWith(nCI); |
| 1010 } |
| 1011 } |
| 1012 |
| 1013 bool |
| 1014 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { |
| 1015 return (CI != NULL |
| 1016 && CI->getOperand(CI->getNumOperands() - 1)->getName() |
| 1017 == "__amdil_get_local_size_int"); |
| 1018 } |
| 1019 |
| 1020 bool |
| 1021 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { |
| 1022 if (!CI) { |
| 1023 return false; |
| 1024 } |
| 1025 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX |
| 1026 && (mSTM->getDeviceName() == "cayman")) { |
| 1027 return false; |
| 1028 } |
| 1029 return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) |
| 1030 == "__amdil_improved_div"; |
| 1031 } |
| 1032 |
| 1033 void |
| 1034 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { |
| 1035 assert(convertAccurateDivide(CI) |
| 1036 && "expanding accurate divide can only happen if it is expandable!"); |
| 1037 BinaryOperator *divOp = |
| 1038 BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), |
| 1039 CI->getOperand(1), "fdiv32", CI); |
| 1040 CI->replaceAllUsesWith(divOp); |
| 1041 } |
| 1042 |
| 1043 bool |
| 1044 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { |
| 1045 if (optLevel != CodeGenOpt::None) { |
| 1046 return false; |
| 1047 } |
| 1048 |
| 1049 if (!CI) { |
| 1050 return false; |
| 1051 } |
| 1052 |
| 1053 unsigned funcNameIdx = 0; |
| 1054 funcNameIdx = CI->getNumOperands() - 1; |
| 1055 StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); |
| 1056 if (calleeName != "__amdil_image2d_read_norm" |
| 1057 && calleeName != "__amdil_image2d_read_unnorm" |
| 1058 && calleeName != "__amdil_image3d_read_norm" |
| 1059 && calleeName != "__amdil_image3d_read_unnorm") { |
| 1060 return false; |
| 1061 } |
| 1062 |
| 1063 unsigned samplerIdx = 2; |
| 1064 samplerIdx = 1; |
| 1065 Value *sampler = CI->getOperand(samplerIdx); |
| 1066 LoadInst *lInst = dyn_cast<LoadInst>(sampler); |
| 1067 if (!lInst) { |
| 1068 return false; |
| 1069 } |
| 1070 |
| 1071 if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { |
| 1072 return false; |
| 1073 } |
| 1074 |
| 1075 GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); |
| 1076 // If we are loading from what is not a global value, then we |
| 1077 // fail and return. |
| 1078 if (!gv) { |
| 1079 return false; |
| 1080 } |
| 1081 |
| 1082 // If we don't have an initializer or we have an initializer and |
| 1083 // the initializer is not a 32bit integer, we fail. |
| 1084 if (!gv->hasInitializer() |
| 1085 || !gv->getInitializer()->getType()->isIntegerTy(32)) { |
| 1086 return false; |
| 1087 } |
| 1088 |
| 1089 // Now that we have the global variable initializer, let's replace |
| 1090 // all uses of the load instruction with the samplerVal and |
| 1091 // reparse the __amdil_is_constant() function. |
| 1092 Constant *samplerVal = gv->getInitializer(); |
| 1093 lInst->replaceAllUsesWith(samplerVal); |
| 1094 return true; |
| 1095 } |
| 1096 |
| 1097 bool |
| 1098 AMDGPUPeepholeOpt::doInitialization(Module &M) { |
| 1099 return false; |
| 1100 } |
| 1101 |
| 1102 bool |
| 1103 AMDGPUPeepholeOpt::doFinalization(Module &M) { |
| 1104 return false; |
| 1105 } |
| 1106 |
| 1107 void |
| 1108 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { |
| 1109 AU.addRequired<MachineFunctionAnalysis>(); |
| 1110 FunctionPass::getAnalysisUsage(AU); |
| 1111 AU.setPreservesAll(); |
| 1112 } |
| 1113 |
| 1114 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { |
| 1115 size_t size = 0; |
| 1116 if (!T) { |
| 1117 return size; |
| 1118 } |
| 1119 switch (T->getTypeID()) { |
| 1120 case Type::X86_FP80TyID: |
| 1121 case Type::FP128TyID: |
| 1122 case Type::PPC_FP128TyID: |
| 1123 case Type::LabelTyID: |
| 1124 assert(0 && "These types are not supported by this backend"); |
| 1125 default: |
| 1126 case Type::FloatTyID: |
| 1127 case Type::DoubleTyID: |
| 1128 size = T->getPrimitiveSizeInBits() >> 3; |
| 1129 break; |
| 1130 case Type::PointerTyID: |
| 1131 size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); |
| 1132 break; |
| 1133 case Type::IntegerTyID: |
| 1134 size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); |
| 1135 break; |
| 1136 case Type::StructTyID: |
| 1137 size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); |
| 1138 break; |
| 1139 case Type::ArrayTyID: |
| 1140 size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); |
| 1141 break; |
| 1142 case Type::FunctionTyID: |
| 1143 size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); |
| 1144 break; |
| 1145 case Type::VectorTyID: |
| 1146 size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); |
| 1147 break; |
| 1148 }; |
| 1149 return size; |
| 1150 } |
| 1151 |
| 1152 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, |
| 1153 bool dereferencePtr) { |
| 1154 size_t size = 0; |
| 1155 if (!ST) { |
| 1156 return size; |
| 1157 } |
| 1158 Type *curType; |
| 1159 StructType::element_iterator eib; |
| 1160 StructType::element_iterator eie; |
| 1161 for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { |
| 1162 curType = *eib; |
| 1163 size += getTypeSize(curType, dereferencePtr); |
| 1164 } |
| 1165 return size; |
| 1166 } |
| 1167 |
| 1168 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, |
| 1169 bool dereferencePtr) { |
| 1170 return IT ? (IT->getBitWidth() >> 3) : 0; |
| 1171 } |
| 1172 |
| 1173 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, |
| 1174 bool dereferencePtr) { |
| 1175 assert(0 && "Should not be able to calculate the size of an function type"); |
| 1176 return 0; |
| 1177 } |
| 1178 |
| 1179 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, |
| 1180 bool dereferencePtr) { |
| 1181 return (size_t)(AT ? (getTypeSize(AT->getElementType(), |
| 1182 dereferencePtr) * AT->getNumElements()) |
| 1183 : 0); |
| 1184 } |
| 1185 |
| 1186 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, |
| 1187 bool dereferencePtr) { |
| 1188 return VT ? (VT->getBitWidth() >> 3) : 0; |
| 1189 } |
| 1190 |
| 1191 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, |
| 1192 bool dereferencePtr) { |
| 1193 if (!PT) { |
| 1194 return 0; |
| 1195 } |
| 1196 Type *CT = PT->getElementType(); |
| 1197 if (CT->getTypeID() == Type::StructTyID && |
| 1198 PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { |
| 1199 return getTypeSize(dyn_cast<StructType>(CT)); |
| 1200 } else if (dereferencePtr) { |
| 1201 size_t size = 0; |
| 1202 for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { |
| 1203 size += getTypeSize(PT->getContainedType(x), dereferencePtr); |
| 1204 } |
| 1205 return size; |
| 1206 } else { |
| 1207 return 4; |
| 1208 } |
| 1209 } |
| 1210 |
| 1211 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, |
| 1212 bool dereferencePtr) { |
| 1213 //assert(0 && "Should not be able to calculate the size of an opaque type"); |
| 1214 return 4; |
| 1215 } |