| Index: src/IceTargetLoweringX86BaseImpl.h
 | 
| diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
 | 
| index b22ec6441ec82ba6e7f41bb2b401e7b10961344e..56ee04d32e6510add168fd47871608b65c2cc9e2 100644
 | 
| --- a/src/IceTargetLoweringX86BaseImpl.h
 | 
| +++ b/src/IceTargetLoweringX86BaseImpl.h
 | 
| @@ -1216,8 +1216,7 @@ void TargetX86Base<Machine>::lowerShift64(InstArithmetic::OpKind Op,
 | 
|      //   t1:ecx = c.lo & 0xff
 | 
|      //   t2 = b.lo
 | 
|      //   t3 = b.hi
 | 
| -    T_1 = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
 | 
| -    _mov(T_1, Src1Lo);
 | 
| +    T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
 | 
|      _mov(T_2, Src0Lo);
 | 
|      _mov(T_3, Src0Hi);
 | 
|      switch (Op) {
 | 
| @@ -1295,6 +1294,7 @@ void TargetX86Base<Machine>::lowerShift64(InstArithmetic::OpKind Op,
 | 
|  template <class Machine>
 | 
|  void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|    Variable *Dest = Inst->getDest();
 | 
| +  Type Ty = Dest->getType();
 | 
|    Operand *Src0 = legalize(Inst->getSrc(0));
 | 
|    Operand *Src1 = legalize(Inst->getSrc(1));
 | 
|    if (Inst->isCommutative()) {
 | 
| @@ -1316,7 +1316,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      assert(SwapCount <= 1);
 | 
|      (void)SwapCount;
 | 
|    }
 | 
| -  if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
 | 
| +  if (!Traits::Is64Bit && Ty == IceType_i64) {
 | 
|      // These x86-32 helper-call-involved instructions are lowered in this
 | 
|      // separate switch. This is because loOperand() and hiOperand() may insert
 | 
|      // redundant instructions for constant blinding and pooling. Such redundant
 | 
| @@ -1463,7 +1463,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      }
 | 
|      return;
 | 
|    }
 | 
| -  if (isVectorType(Dest->getType())) {
 | 
| +  if (isVectorType(Ty)) {
 | 
|      // TODO: Trap on integer divide and integer modulo by zero. See:
 | 
|      // https://code.google.com/p/nativeclient/issues/detail?id=3899
 | 
|      if (llvm::isa<typename Traits::X86OperandMem>(Src1))
 | 
| @@ -1473,46 +1473,45 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|        llvm_unreachable("Unknown arithmetic operator");
 | 
|        break;
 | 
|      case InstArithmetic::Add: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _padd(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::And: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _pand(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Or: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _por(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Xor: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _pxor(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Sub: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _psub(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Mul: {
 | 
| -      bool TypesAreValidForPmull =
 | 
| -          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
 | 
| +      bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
 | 
|        bool InstructionSetIsValidForPmull =
 | 
| -          Dest->getType() == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
 | 
| +          Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
 | 
|        if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
 | 
| -        Variable *T = makeReg(Dest->getType());
 | 
| +        Variable *T = makeReg(Ty);
 | 
|          _movp(T, Src0);
 | 
|          _pmull(T, Src0 == Src1 ? T : Src1);
 | 
|          _movp(Dest, T);
 | 
| -      } else if (Dest->getType() == IceType_v4i32) {
 | 
| +      } else if (Ty == IceType_v4i32) {
 | 
|          // Lowering sequence:
 | 
|          // Note: The mask arguments have index 0 on the left.
 | 
|          //
 | 
| @@ -1550,7 +1549,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|          _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
 | 
|          _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
 | 
|          _movp(Dest, T4);
 | 
| -      } else if (Dest->getType() == IceType_v16i8) {
 | 
| +      } else if (Ty == IceType_v16i8) {
 | 
|          scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
 | 
|        } else {
 | 
|          llvm::report_fatal_error("Invalid vector multiply type");
 | 
| @@ -1566,25 +1565,25 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
 | 
|        break;
 | 
|      case InstArithmetic::Fadd: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _addps(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Fsub: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _subps(T, Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Fmul: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _mulps(T, Src0 == Src1 ? T : Src1);
 | 
|        _movp(Dest, T);
 | 
|      } break;
 | 
|      case InstArithmetic::Fdiv: {
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(Ty);
 | 
|        _movp(T, Src0);
 | 
|        _divps(T, Src1);
 | 
|        _movp(Dest, T);
 | 
| @@ -1633,13 +1632,13 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      }
 | 
|      // The 8-bit version of imul only allows the form "imul r/m8" where T must
 | 
|      // be in al.
 | 
| -    if (isByteSizedArithType(Dest->getType())) {
 | 
| +    if (isByteSizedArithType(Ty)) {
 | 
|        _mov(T, Src0, Traits::RegisterSet::Reg_al);
 | 
|        Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
 | 
|        _imul(T, Src0 == Src1 ? T : Src1);
 | 
|        _mov(Dest, T);
 | 
|      } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
 | 
| -      T = makeReg(Dest->getType());
 | 
| +      T = makeReg(Ty);
 | 
|        _imul_imm(T, Src0, ImmConst);
 | 
|        _mov(Dest, T);
 | 
|      } else {
 | 
| @@ -1650,76 +1649,51 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      break;
 | 
|    case InstArithmetic::Shl:
 | 
|      _mov(T, Src0);
 | 
| -    if (!llvm::isa<ConstantInteger32>(Src1)) {
 | 
| -      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
 | 
| -      _mov(Cl, Src1);
 | 
| -      Src1 = Cl;
 | 
| -    }
 | 
| +    if (!llvm::isa<ConstantInteger32>(Src1))
 | 
| +      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
 | 
|      _shl(T, Src1);
 | 
|      _mov(Dest, T);
 | 
|      break;
 | 
|    case InstArithmetic::Lshr:
 | 
|      _mov(T, Src0);
 | 
| -    if (!llvm::isa<ConstantInteger32>(Src1)) {
 | 
| -      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
 | 
| -      _mov(Cl, Src1);
 | 
| -      Src1 = Cl;
 | 
| -    }
 | 
| +    if (!llvm::isa<ConstantInteger32>(Src1))
 | 
| +      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
 | 
|      _shr(T, Src1);
 | 
|      _mov(Dest, T);
 | 
|      break;
 | 
|    case InstArithmetic::Ashr:
 | 
|      _mov(T, Src0);
 | 
| -    if (!llvm::isa<ConstantInteger32>(Src1)) {
 | 
| -      Variable *Cl = makeReg(IceType_i8, Traits::RegisterSet::Reg_cl);
 | 
| -      _mov(Cl, Src1);
 | 
| -      Src1 = Cl;
 | 
| -    }
 | 
| +    if (!llvm::isa<ConstantInteger32>(Src1))
 | 
| +      Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
 | 
|      _sar(T, Src1);
 | 
|      _mov(Dest, T);
 | 
|      break;
 | 
| -  case InstArithmetic::Udiv:
 | 
| +  case InstArithmetic::Udiv: {
 | 
|      // div and idiv are the few arithmetic operators that do not allow
 | 
|      // immediates as the operand.
 | 
|      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
 | 
| -    if (isByteSizedArithType(Dest->getType())) {
 | 
| -      // For 8-bit unsigned division we need to zero-extend al into ah. A mov
 | 
| -      // $0, %ah (or xor %ah, %ah) would work just fine, except that the x86-64
 | 
| -      // assembler refuses to encode %ah (encoding %spl with a REX prefix
 | 
| -      // instead.) Accessing %ah in 64-bit is "tricky" as you can't encode %ah
 | 
| -      // with any other 8-bit register except for %a[lh], %b[lh], %c[lh], and
 | 
| -      // d[%lh], which means the X86 target lowering (and the register
 | 
| -      // allocator) would have to be aware of this restriction. For now, we
 | 
| -      // simply zero %eax completely, and move the dividend into %al.
 | 
| -      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
 | 
| -      Context.insert(InstFakeDef::create(Func, T_eax));
 | 
| -      _xor(T_eax, T_eax);
 | 
| -      _mov(T, Src0, Traits::RegisterSet::Reg_al);
 | 
| -      _div(T, Src1, T);
 | 
| -      _mov(Dest, T);
 | 
| -      Context.insert(InstFakeUse::create(Func, T_eax));
 | 
| -    } else {
 | 
| -      Type Ty = Dest->getType();
 | 
| -      uint32_t Eax = Traits::RegisterSet::Reg_eax;
 | 
| -      uint32_t Edx = Traits::RegisterSet::Reg_edx;
 | 
| -      switch (Ty) {
 | 
| -      default:
 | 
| -        llvm_unreachable("Bad type for udiv");
 | 
| -      // fallthrough
 | 
| -      case IceType_i32:
 | 
| -        break;
 | 
| -      case IceType_i16:
 | 
| -        Eax = Traits::RegisterSet::Reg_ax;
 | 
| -        Edx = Traits::RegisterSet::Reg_dx;
 | 
| -        break;
 | 
| -      }
 | 
| -      Constant *Zero = Ctx->getConstantZero(Ty);
 | 
| -      _mov(T, Src0, Eax);
 | 
| -      _mov(T_edx, Zero, Edx);
 | 
| -      _div(T, Src1, T_edx);
 | 
| -      _mov(Dest, T);
 | 
| +    uint32_t Eax = Traits::RegisterSet::Reg_eax;
 | 
| +    uint32_t Edx = Traits::RegisterSet::Reg_edx;
 | 
| +    switch (Ty) {
 | 
| +    default:
 | 
| +      llvm_unreachable("Bad type for udiv");
 | 
| +    // fallthrough
 | 
| +    case IceType_i32:
 | 
| +      break;
 | 
| +    case IceType_i16:
 | 
| +      Eax = Traits::RegisterSet::Reg_ax;
 | 
| +      Edx = Traits::RegisterSet::Reg_dx;
 | 
| +      break;
 | 
| +    case IceType_i8:
 | 
| +      Eax = Traits::RegisterSet::Reg_al;
 | 
| +      Edx = Traits::RegisterSet::Reg_ah;
 | 
| +      break;
 | 
|      }
 | 
| -    break;
 | 
| +    _mov(T, Src0, Eax);
 | 
| +    _mov(T_edx, Ctx->getConstantZero(Ty), Edx);
 | 
| +    _div(T, Src1, T_edx);
 | 
| +    _mov(Dest, T);
 | 
| +  } break;
 | 
|    case InstArithmetic::Sdiv:
 | 
|      // TODO(stichnot): Enable this after doing better performance and cross
 | 
|      // testing.
 | 
| @@ -1731,7 +1705,6 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|          uint32_t UDivisor = static_cast<uint32_t>(Divisor);
 | 
|          if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
 | 
|            uint32_t LogDiv = llvm::Log2_32(UDivisor);
 | 
| -          Type Ty = Dest->getType();
 | 
|            // LLVM does the following for dest=src/(1<<log):
 | 
|            //   t=src
 | 
|            //   sar t,typewidth-1 // -1 if src is negative, 0 if not
 | 
| @@ -1757,7 +1730,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|        }
 | 
|      }
 | 
|      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
 | 
| -    switch (Type Ty = Dest->getType()) {
 | 
| +    switch (Ty) {
 | 
|      default:
 | 
|        llvm_unreachable("Bad type for sdiv");
 | 
|      // fallthrough
 | 
| @@ -1778,47 +1751,32 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      _idiv(T, Src1, T_edx);
 | 
|      _mov(Dest, T);
 | 
|      break;
 | 
| -  case InstArithmetic::Urem:
 | 
| +  case InstArithmetic::Urem: {
 | 
|      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
 | 
| -    if (isByteSizedArithType(Dest->getType())) {
 | 
| -      Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
 | 
| -      Context.insert(InstFakeDef::create(Func, T_eax));
 | 
| -      _xor(T_eax, T_eax);
 | 
| -      _mov(T, Src0, Traits::RegisterSet::Reg_al);
 | 
| -      _div(T, Src1, T);
 | 
| -      // shr $8, %eax shifts ah (i.e., the 8 bit remainder) into al. We don't
 | 
| -      // mov %ah, %al because it would make x86-64 codegen more complicated. If
 | 
| -      // this ever becomes a problem we can introduce a pseudo rem instruction
 | 
| -      // that returns the remainder in %al directly (and uses a mov for copying
 | 
| -      // %ah to %al.)
 | 
| -      static constexpr uint8_t AlSizeInBits = 8;
 | 
| -      _shr(T_eax, Ctx->getConstantInt8(AlSizeInBits));
 | 
| -      _mov(Dest, T);
 | 
| -      Context.insert(InstFakeUse::create(Func, T_eax));
 | 
| -    } else {
 | 
| -      Type Ty = Dest->getType();
 | 
| -      uint32_t Eax = Traits::RegisterSet::Reg_eax;
 | 
| -      uint32_t Edx = Traits::RegisterSet::Reg_edx;
 | 
| -      switch (Ty) {
 | 
| -      default:
 | 
| -        llvm_unreachable("Bad type for urem");
 | 
| -      // fallthrough
 | 
| -      case IceType_i32:
 | 
| -        break;
 | 
| -      case IceType_i16:
 | 
| -        Eax = Traits::RegisterSet::Reg_ax;
 | 
| -        Edx = Traits::RegisterSet::Reg_dx;
 | 
| -        break;
 | 
| -      }
 | 
| -      Constant *Zero = Ctx->getConstantZero(Ty);
 | 
| -      T_edx = makeReg(Dest->getType(), Edx);
 | 
| -      _mov(T_edx, Zero);
 | 
| -      _mov(T, Src0, Eax);
 | 
| -      _div(T_edx, Src1, T);
 | 
| -      _mov(Dest, T_edx);
 | 
| +    uint32_t Eax = Traits::RegisterSet::Reg_eax;
 | 
| +    uint32_t Edx = Traits::RegisterSet::Reg_edx;
 | 
| +    switch (Ty) {
 | 
| +    default:
 | 
| +      llvm_unreachable("Bad type for urem");
 | 
| +    // fallthrough
 | 
| +    case IceType_i32:
 | 
| +      break;
 | 
| +    case IceType_i16:
 | 
| +      Eax = Traits::RegisterSet::Reg_ax;
 | 
| +      Edx = Traits::RegisterSet::Reg_dx;
 | 
| +      break;
 | 
| +    case IceType_i8:
 | 
| +      Eax = Traits::RegisterSet::Reg_al;
 | 
| +      Edx = Traits::RegisterSet::Reg_ah;
 | 
| +      break;
 | 
|      }
 | 
| -    break;
 | 
| -  case InstArithmetic::Srem:
 | 
| +    T_edx = makeReg(Ty, Edx);
 | 
| +    _mov(T_edx, Ctx->getConstantZero(Ty));
 | 
| +    _mov(T, Src0, Eax);
 | 
| +    _div(T_edx, Src1, T);
 | 
| +    _mov(Dest, T_edx);
 | 
| +  } break;
 | 
| +  case InstArithmetic::Srem: {
 | 
|      // TODO(stichnot): Enable this after doing better performance and cross
 | 
|      // testing.
 | 
|      if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
 | 
| @@ -1829,7 +1787,6 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|          uint32_t UDivisor = static_cast<uint32_t>(Divisor);
 | 
|          if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
 | 
|            uint32_t LogDiv = llvm::Log2_32(UDivisor);
 | 
| -          Type Ty = Dest->getType();
 | 
|            // LLVM does the following for dest=src%(1<<log):
 | 
|            //   t=src
 | 
|            //   sar t,typewidth-1 // -1 if src is negative, 0 if not
 | 
| @@ -1860,37 +1817,29 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|        }
 | 
|      }
 | 
|      Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
 | 
| -    switch (Type Ty = Dest->getType()) {
 | 
| +    uint32_t Eax = Traits::RegisterSet::Reg_eax;
 | 
| +    uint32_t Edx = Traits::RegisterSet::Reg_edx;
 | 
| +    switch (Ty) {
 | 
|      default:
 | 
|        llvm_unreachable("Bad type for srem");
 | 
|      // fallthrough
 | 
|      case IceType_i32:
 | 
| -      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
 | 
| -      _mov(T, Src0, Traits::RegisterSet::Reg_eax);
 | 
| -      _cbwdq(T_edx, T);
 | 
| -      _idiv(T_edx, Src1, T);
 | 
| -      _mov(Dest, T_edx);
 | 
|        break;
 | 
|      case IceType_i16:
 | 
| -      T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
 | 
| -      _mov(T, Src0, Traits::RegisterSet::Reg_ax);
 | 
| -      _cbwdq(T_edx, T);
 | 
| -      _idiv(T_edx, Src1, T);
 | 
| -      _mov(Dest, T_edx);
 | 
| +      Eax = Traits::RegisterSet::Reg_ax;
 | 
| +      Edx = Traits::RegisterSet::Reg_dx;
 | 
|        break;
 | 
|      case IceType_i8:
 | 
| -      T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
 | 
| -      // TODO(stichnot): Use register ah for T_edx, and remove the _shr().
 | 
| -      // T_edx = makeReg(Ty, Traits::RegisterSet::Reg_ah);
 | 
| -      _mov(T, Src0, Traits::RegisterSet::Reg_al);
 | 
| -      _cbwdq(T_edx, T);
 | 
| -      _idiv(T_edx, Src1, T);
 | 
| -      static constexpr uint8_t AlSizeInBits = 8;
 | 
| -      _shr(T_edx, Ctx->getConstantInt8(AlSizeInBits));
 | 
| -      _mov(Dest, T_edx);
 | 
| +      Eax = Traits::RegisterSet::Reg_al;
 | 
| +      Edx = Traits::RegisterSet::Reg_ah;
 | 
|        break;
 | 
|      }
 | 
| -    break;
 | 
| +    T_edx = makeReg(Ty, Edx);
 | 
| +    _mov(T, Src0, Eax);
 | 
| +    _cbwdq(T_edx, T);
 | 
| +    _idiv(T_edx, Src1, T);
 | 
| +    _mov(Dest, T_edx);
 | 
| +  } break;
 | 
|    case InstArithmetic::Fadd:
 | 
|      _mov(T, Src0);
 | 
|      _addss(T, Src1);
 | 
| @@ -1913,7 +1862,6 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
 | 
|      break;
 | 
|    case InstArithmetic::Frem: {
 | 
|      constexpr SizeT MaxSrcs = 2;
 | 
| -    Type Ty = Dest->getType();
 | 
|      InstCall *Call = makeHelperCall(
 | 
|          isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
 | 
|      Call->addArg(Src0);
 | 
| @@ -1991,6 +1939,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|    // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
 | 
|    InstCast::OpKind CastKind = Inst->getCastKind();
 | 
|    Variable *Dest = Inst->getDest();
 | 
| +  Type DestTy = Dest->getType();
 | 
|    switch (CastKind) {
 | 
|    default:
 | 
|      Func->setError("Cast type not supported");
 | 
| @@ -2003,15 +1952,14 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|      // we're unlikely to see something like that in the bitcode that the
 | 
|      // optimizer wouldn't have already taken care of.
 | 
|      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| -      Type DestTy = Dest->getType();
 | 
| +    if (isVectorType(DestTy)) {
 | 
|        if (DestTy == IceType_v16i8) {
 | 
|          // onemask = materialize(1,1,...); dst = (src & onemask) > 0
 | 
| -        Variable *OneMask = makeVectorOfOnes(Dest->getType());
 | 
| +        Variable *OneMask = makeVectorOfOnes(DestTy);
 | 
|          Variable *T = makeReg(DestTy);
 | 
|          _movp(T, Src0RM);
 | 
|          _pand(T, OneMask);
 | 
| -        Variable *Zeros = makeVectorOfZeros(Dest->getType());
 | 
| +        Variable *Zeros = makeVectorOfZeros(DestTy);
 | 
|          _pcmpgt(T, Zeros);
 | 
|          _movp(Dest, T);
 | 
|        } else {
 | 
| @@ -2026,7 +1974,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|          _psra(T, ShiftConstant);
 | 
|          _movp(Dest, T);
 | 
|        }
 | 
| -    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
 | 
| +    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
 | 
|        // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
 | 
|        Constant *Shift = Ctx->getConstantInt32(31);
 | 
|        Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
 | 
| @@ -2053,12 +2001,10 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        // shl t1, dst_bitwidth - 1
 | 
|        // sar t1, dst_bitwidth - 1
 | 
|        // dst = t1
 | 
| -      size_t DestBits =
 | 
| -          Traits::X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
 | 
| +      size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
 | 
|        Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| -      if (typeWidthInBytes(Dest->getType()) <=
 | 
| -          typeWidthInBytes(Src0RM->getType())) {
 | 
| +      Variable *T = makeReg(DestTy);
 | 
| +      if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
 | 
|          _mov(T, Src0RM);
 | 
|        } else {
 | 
|          // Widen the source using movsx or movzx. (It doesn't matter which one,
 | 
| @@ -2070,7 +2016,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        _mov(Dest, T);
 | 
|      } else {
 | 
|        // t1 = movsx src; dst = t1
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _movsx(T, Src0RM);
 | 
|        _mov(Dest, T);
 | 
|      }
 | 
| @@ -2078,15 +2024,14 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|    }
 | 
|    case InstCast::Zext: {
 | 
|      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| +    if (isVectorType(DestTy)) {
 | 
|        // onemask = materialize(1,1,...); dest = onemask & src
 | 
| -      Type DestTy = Dest->getType();
 | 
|        Variable *OneMask = makeVectorOfOnes(DestTy);
 | 
|        Variable *T = makeReg(DestTy);
 | 
|        _movp(T, Src0RM);
 | 
|        _pand(T, OneMask);
 | 
|        _movp(Dest, T);
 | 
| -    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
 | 
| +    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
 | 
|        // t1=movzx src; dst.lo=t1; dst.hi=0
 | 
|        Constant *Zero = Ctx->getConstantZero(IceType_i32);
 | 
|        Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
 | 
| @@ -2101,7 +2046,6 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        _mov(DestHi, Zero);
 | 
|      } else if (Src0RM->getType() == IceType_i1) {
 | 
|        // t = Src0RM; Dest = t
 | 
| -      Type DestTy = Dest->getType();
 | 
|        Variable *T = nullptr;
 | 
|        if (DestTy == IceType_i8) {
 | 
|          _mov(T, Src0RM);
 | 
| @@ -2117,32 +2061,40 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        _mov(Dest, T);
 | 
|      } else {
 | 
|        // t1 = movzx src; dst = t1
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _movzx(T, Src0RM);
 | 
|        _mov(Dest, T);
 | 
|      }
 | 
|      break;
 | 
|    }
 | 
|    case InstCast::Trunc: {
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| +    if (isVectorType(DestTy)) {
 | 
|        // onemask = materialize(1,1,...); dst = src & onemask
 | 
|        Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|        Type Src0Ty = Src0RM->getType();
 | 
|        Variable *OneMask = makeVectorOfOnes(Src0Ty);
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _movp(T, Src0RM);
 | 
|        _pand(T, OneMask);
 | 
|        _movp(Dest, T);
 | 
| +    } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
 | 
| +      // Make sure we truncate from and into valid registers.
 | 
| +      Operand *Src0 = legalizeUndef(Inst->getSrc(0));
 | 
| +      if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
 | 
| +        Src0 = loOperand(Src0);
 | 
| +      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
 | 
| +      Variable *T = copyToReg8(Src0RM);
 | 
| +      if (DestTy == IceType_i1)
 | 
| +        _and(T, Ctx->getConstantInt1(1));
 | 
| +      _mov(Dest, T);
 | 
|      } else {
 | 
|        Operand *Src0 = legalizeUndef(Inst->getSrc(0));
 | 
|        if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
 | 
|          Src0 = loOperand(Src0);
 | 
|        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
 | 
|        // t1 = trunc Src0RM; Dest = t1
 | 
| -      Variable *T = nullptr;
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _mov(T, Src0RM);
 | 
| -      if (Dest->getType() == IceType_i1)
 | 
| -        _and(T, Ctx->getConstantInt1(1));
 | 
|        _mov(Dest, T);
 | 
|      }
 | 
|      break;
 | 
| @@ -2151,22 +2103,22 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|    case InstCast::Fpext: {
 | 
|      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|      // t1 = cvt Src0RM; Dest = t1
 | 
| -    Variable *T = makeReg(Dest->getType());
 | 
| +    Variable *T = makeReg(DestTy);
 | 
|      _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
 | 
|      _mov(Dest, T);
 | 
|      break;
 | 
|    }
 | 
|    case InstCast::Fptosi:
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| -      assert(Dest->getType() == IceType_v4i32 &&
 | 
| +    if (isVectorType(DestTy)) {
 | 
| +      assert(DestTy == IceType_v4i32 &&
 | 
|               Inst->getSrc(0)->getType() == IceType_v4f32);
 | 
|        Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|        if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
 | 
|          Src0RM = legalizeToReg(Src0RM);
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
 | 
|        _movp(Dest, T);
 | 
| -    } else if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
 | 
| +    } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
|        Type SrcType = Inst->getSrc(0)->getType();
 | 
|        InstCall *Call =
 | 
| @@ -2179,40 +2131,44 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|        // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
 | 
|        Variable *T_1 = nullptr;
 | 
| -      if (Traits::Is64Bit && Dest->getType() == IceType_i64) {
 | 
| +      if (Traits::Is64Bit && DestTy == IceType_i64) {
 | 
|          T_1 = makeReg(IceType_i64);
 | 
|        } else {
 | 
| -        assert(Dest->getType() != IceType_i64);
 | 
| +        assert(DestTy != IceType_i64);
 | 
|          T_1 = makeReg(IceType_i32);
 | 
|        }
 | 
|        // cvt() requires its integer argument to be a GPR.
 | 
| -      Variable *T_2 = makeReg(Dest->getType());
 | 
| +      Variable *T_2 = makeReg(DestTy);
 | 
| +      if (isByteSizedType(DestTy)) {
 | 
| +        assert(T_1->getType() == IceType_i32);
 | 
| +        T_1->setRegClass(RCX86_Is32To8);
 | 
| +        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
 | 
| +      }
 | 
|        _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
 | 
|        _mov(T_2, T_1); // T_1 and T_2 may have different integer types
 | 
| -      if (Dest->getType() == IceType_i1)
 | 
| +      if (DestTy == IceType_i1)
 | 
|          _and(T_2, Ctx->getConstantInt1(1));
 | 
|        _mov(Dest, T_2);
 | 
|      }
 | 
|      break;
 | 
|    case InstCast::Fptoui:
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| -      assert(Dest->getType() == IceType_v4i32 &&
 | 
| +    if (isVectorType(DestTy)) {
 | 
| +      assert(DestTy == IceType_v4i32 &&
 | 
|               Inst->getSrc(0)->getType() == IceType_v4f32);
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
|        InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
 | 
|        Call->addArg(Inst->getSrc(0));
 | 
|        lowerCall(Call);
 | 
| -    } else if (Dest->getType() == IceType_i64 ||
 | 
| -               (!Traits::Is64Bit && Dest->getType() == IceType_i32)) {
 | 
| +    } else if (DestTy == IceType_i64 ||
 | 
| +               (!Traits::Is64Bit && DestTy == IceType_i32)) {
 | 
|        // Use a helper for both x86-32 and x86-64.
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
| -      Type DestType = Dest->getType();
 | 
|        Type SrcType = Inst->getSrc(0)->getType();
 | 
|        IceString TargetString;
 | 
|        if (Traits::Is64Bit) {
 | 
|          TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
 | 
|                                                           : H_fptoui_f64_i64;
 | 
| -      } else if (isInt32Asserting32Or64(DestType)) {
 | 
| +      } else if (isInt32Asserting32Or64(DestTy)) {
 | 
|          TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
 | 
|                                                           : H_fptoui_f64_i32;
 | 
|        } else {
 | 
| @@ -2226,39 +2182,43 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|      } else {
 | 
|        Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|        // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
 | 
| -      assert(Dest->getType() != IceType_i64);
 | 
| +      assert(DestTy != IceType_i64);
 | 
|        Variable *T_1 = nullptr;
 | 
| -      if (Traits::Is64Bit && Dest->getType() == IceType_i32) {
 | 
| +      if (Traits::Is64Bit && DestTy == IceType_i32) {
 | 
|          T_1 = makeReg(IceType_i64);
 | 
|        } else {
 | 
| -        assert(Dest->getType() != IceType_i32);
 | 
| +        assert(DestTy != IceType_i32);
 | 
|          T_1 = makeReg(IceType_i32);
 | 
|        }
 | 
| -      Variable *T_2 = makeReg(Dest->getType());
 | 
| +      Variable *T_2 = makeReg(DestTy);
 | 
| +      if (isByteSizedType(DestTy)) {
 | 
| +        assert(T_1->getType() == IceType_i32);
 | 
| +        T_1->setRegClass(RCX86_Is32To8);
 | 
| +        T_2->setRegClass(RCX86_IsTrunc8Rcvr);
 | 
| +      }
 | 
|        _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
 | 
|        _mov(T_2, T_1); // T_1 and T_2 may have different integer types
 | 
| -      if (Dest->getType() == IceType_i1)
 | 
| +      if (DestTy == IceType_i1)
 | 
|          _and(T_2, Ctx->getConstantInt1(1));
 | 
|        _mov(Dest, T_2);
 | 
|      }
 | 
|      break;
 | 
|    case InstCast::Sitofp:
 | 
| -    if (isVectorType(Dest->getType())) {
 | 
| -      assert(Dest->getType() == IceType_v4f32 &&
 | 
| +    if (isVectorType(DestTy)) {
 | 
| +      assert(DestTy == IceType_v4f32 &&
 | 
|               Inst->getSrc(0)->getType() == IceType_v4i32);
 | 
|        Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
 | 
|        if (llvm::isa<typename Traits::X86OperandMem>(Src0RM))
 | 
|          Src0RM = legalizeToReg(Src0RM);
 | 
| -      Variable *T = makeReg(Dest->getType());
 | 
| +      Variable *T = makeReg(DestTy);
 | 
|        _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
 | 
|        _movp(Dest, T);
 | 
|      } else if (!Traits::Is64Bit && Inst->getSrc(0)->getType() == IceType_i64) {
 | 
|        // Use a helper for x86-32.
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
| -      Type DestType = Dest->getType();
 | 
|        InstCall *Call =
 | 
| -          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
 | 
| -                                                            : H_sitofp_i64_f64,
 | 
| +          makeHelperCall(isFloat32Asserting32Or64(DestTy) ? H_sitofp_i64_f32
 | 
| +                                                          : H_sitofp_i64_f64,
 | 
|                           Dest, MaxSrcs);
 | 
|        // TODO: Call the correct compiler-rt helper function.
 | 
|        Call->addArg(Inst->getSrc(0));
 | 
| @@ -2275,7 +2235,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|          assert(Src0RM->getType() != IceType_i64);
 | 
|          T_1 = makeReg(IceType_i32);
 | 
|        }
 | 
| -      Variable *T_2 = makeReg(Dest->getType());
 | 
| +      Variable *T_2 = makeReg(DestTy);
 | 
|        if (Src0RM->getType() == T_1->getType())
 | 
|          _mov(T_1, Src0RM);
 | 
|        else
 | 
| @@ -2287,8 +2247,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|    case InstCast::Uitofp: {
 | 
|      Operand *Src0 = Inst->getSrc(0);
 | 
|      if (isVectorType(Src0->getType())) {
 | 
| -      assert(Dest->getType() == IceType_v4f32 &&
 | 
| -             Src0->getType() == IceType_v4i32);
 | 
| +      assert(DestTy == IceType_v4f32 && Src0->getType() == IceType_v4i32);
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
|        InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
 | 
|        Call->addArg(Src0);
 | 
| @@ -2298,14 +2257,13 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        // Use a helper for x86-32 and x86-64. Also use a helper for i32 on
 | 
|        // x86-32.
 | 
|        constexpr SizeT MaxSrcs = 1;
 | 
| -      Type DestType = Dest->getType();
 | 
|        IceString TargetString;
 | 
|        if (isInt32Asserting32Or64(Src0->getType())) {
 | 
| -        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
 | 
| -                                                          : H_uitofp_i32_f64;
 | 
| +        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i32_f32
 | 
| +                                                        : H_uitofp_i32_f64;
 | 
|        } else {
 | 
| -        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
 | 
| -                                                          : H_uitofp_i64_f64;
 | 
| +        TargetString = isFloat32Asserting32Or64(DestTy) ? H_uitofp_i64_f32
 | 
| +                                                        : H_uitofp_i64_f64;
 | 
|        }
 | 
|        InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
 | 
|        Call->addArg(Src0);
 | 
| @@ -2323,7 +2281,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|          assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
 | 
|          T_1 = makeReg(IceType_i32);
 | 
|        }
 | 
| -      Variable *T_2 = makeReg(Dest->getType());
 | 
| +      Variable *T_2 = makeReg(DestTy);
 | 
|        if (Src0RM->getType() == T_1->getType())
 | 
|          _mov(T_1, Src0RM);
 | 
|        else
 | 
| @@ -2335,12 +2293,12 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|    }
 | 
|    case InstCast::Bitcast: {
 | 
|      Operand *Src0 = Inst->getSrc(0);
 | 
| -    if (Dest->getType() == Src0->getType()) {
 | 
| +    if (DestTy == Src0->getType()) {
 | 
|        InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
 | 
|        lowerAssign(Assign);
 | 
|        return;
 | 
|      }
 | 
| -    switch (Dest->getType()) {
 | 
| +    switch (DestTy) {
 | 
|      default:
 | 
|        llvm_unreachable("Unexpected Bitcast dest type");
 | 
|      case IceType_i8: {
 | 
| @@ -2358,11 +2316,9 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|      case IceType_i32:
 | 
|      case IceType_f32: {
 | 
|        Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
 | 
| -      Type DestType = Dest->getType();
 | 
|        Type SrcType = Src0RM->getType();
 | 
| -      (void)DestType;
 | 
| -      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
 | 
| -             (DestType == IceType_f32 && SrcType == IceType_i32));
 | 
| +      assert((DestTy == IceType_i32 && SrcType == IceType_f32) ||
 | 
| +             (DestTy == IceType_f32 && SrcType == IceType_i32));
 | 
|        // a.i32 = bitcast b.f32 ==>
 | 
|        //   t.f32 = b.f32
 | 
|        //   s.f32 = spill t.f32
 | 
| @@ -2436,7 +2392,7 @@ void TargetX86Base<Machine>::lowerCast(const InstCast *Inst) {
 | 
|        } else {
 | 
|          Src0 = legalize(Src0);
 | 
|          if (llvm::isa<typename Traits::X86OperandMem>(Src0)) {
 | 
| -          Variable *T = Func->makeVariable(Dest->getType());
 | 
| +          Variable *T = Func->makeVariable(DestTy);
 | 
|            _movq(T, Src0);
 | 
|            _movq(Dest, T);
 | 
|            break;
 | 
| @@ -3037,17 +2993,21 @@ void TargetX86Base<Machine>::lowerInsertElement(const InstInsertElement *Inst) {
 | 
|          legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
 | 
|      Variable *T = makeReg(Ty);
 | 
|      _movp(T, SourceVectRM);
 | 
| -    if (Ty == IceType_v4f32)
 | 
| +    if (Ty == IceType_v4f32) {
 | 
|        _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
 | 
| -    else
 | 
| -      // TODO(stichnot): For the pinsrb and pinsrw instructions, when the source
 | 
| -      // operand is a register, it must be a full r32 register like eax, and not
 | 
| -      // ax/al/ah.  For filetype=asm, InstX86Pinsr<Machine>::emit() compensates
 | 
| -      // for the use of r16 and r8 by converting them through getBaseReg(),
 | 
| -      // while emitIAS() validates that the original and base register encodings
 | 
| -      // are the same.  But for an "interior" register like ah, it should
 | 
| -      // probably be copied into an r32 via movzx so that the types work out.
 | 
| +    } else {
 | 
| +      // For the pinsrb and pinsrw instructions, when the source operand is a
 | 
| +      // register, it must be a full r32 register like eax, and not ax/al/ah.
 | 
| +      // For filetype=asm, InstX86Pinsr<Machine>::emit() compensates for the use
 | 
| +      // of r16 and r8 by converting them through getBaseReg(), while emitIAS()
 | 
| +      // validates that the original and base register encodings are the same.
 | 
| +      if (ElementRM->getType() == IceType_i8 &&
 | 
| +          llvm::isa<Variable>(ElementRM)) {
 | 
| +        // Don't use ah/bh/ch/dh for pinsrb.
 | 
| +        ElementRM = copyToReg8(ElementRM);
 | 
| +      }
 | 
|        _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
 | 
| +    }
 | 
|      _movp(Inst->getDest(), T);
 | 
|    } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
 | 
|      // Use shufps or movss.
 | 
| @@ -5354,6 +5314,67 @@ TargetX86Base<Machine>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
 | 
|    return Traits::X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
 | 
|  }
 | 
|  
 | 
| +/// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
 | 
| +/// Src is assumed to already be legalized.  If the source operand is known to
 | 
| +/// be a memory or immediate operand, a simple mov will suffice.  But if the
 | 
| +/// source operand can be a physical register, then it must first be copied into
 | 
| +/// a physical register that is truncable to 8-bit, then truncated into a
 | 
| +/// physical register that can receive a truncation, and finally copied into the
 | 
| +/// result 8-bit register (which in general can be any 8-bit register).  For
 | 
| +/// example, moving %ebp into %ah may be accomplished as:
 | 
| +///   movl %ebp, %edx
 | 
| +///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
 | 
| +///   movb %dl, %ah
 | 
| +/// On the other hand, moving a memory or immediate operand into ah:
 | 
| +///   movb 4(%ebp), %ah
 | 
| +///   movb $my_imm, %ah
 | 
| +///
 | 
| +/// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
 | 
| +/// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
 | 
| +/// use RegNum=NoRegister and then let the caller do a separate copy into
 | 
| +/// Reg_ah.
 | 
| +///
 | 
| +/// Note #2.  ConstantRelocatable operands are also put through this process
 | 
| +/// (not truncated directly) because our ELF emitter does R_386_32 relocations
 | 
| +/// but not R_386_8 relocations.
 | 
| +///
 | 
| +/// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
 | 
| +/// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
 | 
| +/// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
 | 
| +/// to the pinsrb instruction.
 | 
| +template <class Machine>
 | 
| +Variable *TargetX86Base<Machine>::copyToReg8(Operand *Src, int32_t RegNum) {
 | 
| +  Type Ty = Src->getType();
 | 
| +  assert(isScalarIntegerType(Ty)); // only scalar integer sources are accepted
 | 
| +  assert(Ty != IceType_i1); // i1 sources are rejected
 | 
| +  Variable *Reg = makeReg(IceType_i8, RegNum); // the i8 result; RegNum is the caller's register choice
 | 
| +  Reg->setRegClass(RCX86_IsTrunc8Rcvr);
 | 
| +  if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) { // three-step path; any other operand kind gets the single mov below
 | 
| +    Variable *SrcTruncable = makeReg(Ty); // same type as Src; register class narrowed by the switch
 | 
| +    switch (Ty) { // pick the "truncable to 8-bit" class matching the source width
 | 
| +    case IceType_i64:
 | 
| +      SrcTruncable->setRegClass(RCX86_Is64To8);
 | 
| +      break;
 | 
| +    case IceType_i32:
 | 
| +      SrcTruncable->setRegClass(RCX86_Is32To8);
 | 
| +      break;
 | 
| +    case IceType_i16:
 | 
| +      SrcTruncable->setRegClass(RCX86_Is16To8);
 | 
| +      break;
 | 
| +    default:
 | 
| +      // i8 - just use default register class
 | 
| +      break;
 | 
| +    }
 | 
| +    Variable *SrcRcvr = makeReg(IceType_i8); // intermediate i8 that receives the truncation
 | 
| +    SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
 | 
| +    _mov(SrcTruncable, Src); // step 1: copy Src into the truncable register
 | 
| +    _mov(SrcRcvr, SrcTruncable); // step 2: move from the Ty-typed register into the i8 receiver
 | 
| +    Src = SrcRcvr; // the final mov below now reads the i8 receiver
 | 
| +  }
 | 
| +  _mov(Reg, Src); // final copy into the result register
 | 
| +  return Reg;
 | 
| +}
 | 
| +
 | 
|  /// Helper for legalize() to emit the right code to lower an operand to a
 | 
|  /// register of the appropriate type.
 | 
|  template <class Machine>
 | 
| 
 |