409 files changed, 7415 insertions, 2792 deletions
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 4a6abae8e97..fb9ece2bd20 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1672,9 +1672,9 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
   // If both pointers are pointing into the same object and one of them
   // accesses the entire object, then the accesses must overlap in some way.
   if (O1 == O2)
-    if ((V1Size != MemoryLocation::UnknownSize &&
-         isObjectSize(O1, V1Size, DL, TLI)) ||
-        (V2Size != MemoryLocation::UnknownSize &&
+    if (V1Size != MemoryLocation::UnknownSize &&
+        V2Size != MemoryLocation::UnknownSize &&
+        (isObjectSize(O1, V1Size, DL, TLI) ||
          isObjectSize(O2, V2Size, DL, TLI)))
       return AliasCache[Locs] = PartialAlias;
 
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 5d2170dcf15..41c29589521 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
   return BFI->getProfileCountFromFreq(*getFunction(), Freq);
 }
 
+bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) {
+  assert(BFI && "Expected analysis to be available");
+  return BFI->isIrrLoopHeader(BB);
+}
+
 void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
   assert(BFI && "Expected analysis to be available");
   BFI->setBlockFreq(BB, Freq);
diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 1030407b766..7e323022d9c 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() {
   // Swap with a default-constructed std::vector, since std::vector<>::clear()
   // does not actually clear heap storage.
   std::vector<FrequencyData>().swap(Freqs);
+  IsIrrLoopHeader.clear();
   std::vector<WorkingData>().swap(Working);
   Loops.clear();
 }
@@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() {
 /// Releases all memory not used downstream.  In particular, saves Freqs.
 static void cleanup(BlockFrequencyInfoImplBase &BFI) {
   std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
+  SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader));
   BFI.clear();
   BFI.Freqs = std::move(SavedFreqs);
+  BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
 }
 
 bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
@@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F,
   return BlockCount.getLimitedValue();
 }
 
+bool
+BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) {
+  if (!Node.isValid())
+    return false;
+  return IsIrrLoopHeader.test(Node.Index);
+}
+
 Scaled64
 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
   if (!Node.isValid())
@@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
     DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
   }
 }
+
+void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) {
+  BlockMass LoopMass = BlockMass::getFull();
+  DitheringDistributer D(Dist, LoopMass);
+  for (const Weight &W : Dist.Weights) {
+    BlockMass Taken = D.takeMass(W.Amount);
+    assert(W.Type == Weight::Local && "all weights should be local");
+    Working[W.TargetNode.Index].getMass() = Taken;
+    DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+  }
+}
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 19889658b13..e141d6c58b6 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2136,8 +2136,51 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   if (!Stride)
     return;
 
-  DEBUG(dbgs() << "LAA: Found a strided access that we can version");
+  DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
+                  "versioning:");
   DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
+
+  // Avoid adding the "Stride == 1" predicate when we know that 
+  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
+  // or zero iteration loop, as Trip-Count <= Stride == 1.
+  // 
+  // TODO: We are currently not making a very informed decision on when it is
+  // beneficial to apply stride versioning. It might make more sense that the
+  // users of this analysis (such as the vectorizer) will trigger it, based on 
+  // their specific cost considerations; For example, in cases where stride 
+  // versioning does  not help resolving memory accesses/dependences, the
+  // vectorizer should evaluate the cost of the runtime test, and the benefit 
+  // of various possible stride specializations, considering the alternatives 
+  // of using gather/scatters (if available). 
+  
+  const SCEV *StrideExpr = PSE->getSCEV(Stride);
+  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();  
+
+  // Match the types so we can compare the stride and the BETakenCount.
+  // The Stride can be positive/negative, so we sign extend Stride; 
+  // The backdgeTakenCount is non-negative, so we zero extend BETakenCount.
+  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+  uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
+  uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
+  const SCEV *CastedStride = StrideExpr;
+  const SCEV *CastedBECount = BETakenCount;
+  ScalarEvolution *SE = PSE->getSE();
+  if (BETypeSize >= StrideTypeSize)
+    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
+  else
+    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
+  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
+  // Since TripCount == BackEdgeTakenCount + 1, checking:
+  // "Stride >= TripCount" is equivalent to checking: 
+  // Stride - BETakenCount > 0
+  if (SE->isKnownPositive(StrideMinusBETaken)) {
+    DEBUG(dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
+                    "Stride==1 predicate will imply that the loop executes "
+                    "at most once.\n");
+    return;
+  }  
+  DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
+
   SymbolicStrides[Ptr] = Stride;
   StrideSet.insert(Stride);
 }
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index afd575e7273..82db09ca97b 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -303,7 +303,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
       // FIXME: refactor this to use the same code that inliner is using.
       F.isVarArg();
   GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
-                                    /* Live = */ false);
+                                    /* Live = */ false, F.isDSOLocal());
   FunctionSummary::FFlags FunFlags{
       F.hasFnAttribute(Attribute::ReadNone),
       F.hasFnAttribute(Attribute::ReadOnly),
@@ -329,7 +329,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
   findRefEdges(Index, &V, RefEdges, Visited);
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
-                                    /* Live = */ false);
+                                    /* Live = */ false, V.isDSOLocal());
   auto GVarSummary =
       llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
   if (NonRenamableLocal)
@@ -342,7 +342,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
                     DenseSet<GlobalValue::GUID> &CantBePromoted) {
   bool NonRenamableLocal = isNonRenamableLocal(A);
   GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal,
-                                    /* Live = */ false);
+                                    /* Live = */ false, A.isDSOLocal());
   auto AS = llvm::make_unique<AliasSummary>(Flags);
   auto *Aliasee = A.getBaseObject();
   auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee);
@@ -410,7 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
           assert(GV->isDeclaration() && "Def in module asm already has definition");
           GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage,
                                               /* NotEligibleToImport = */ true,
-                                              /* Live = */ true);
+                                              /* Live = */ true,
+                                              /* Local */ GV->isDSOLocal());
           CantBePromoted.insert(GlobalValue::getGUID(Name));
           // Create the appropriate summary type.
           if (Function *F = dyn_cast<Function>(GV)) {
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 3a3a7ad3955..1e36e314b86 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -314,17 +314,8 @@ AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA,
   if (!EnableTBAA)
     return AAResultBase::alias(LocA, LocB);
 
-  // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must
-  // be conservative.
-  const MDNode *AM = LocA.AATags.TBAA;
-  if (!AM)
-    return AAResultBase::alias(LocA, LocB);
-  const MDNode *BM = LocB.AATags.TBAA;
-  if (!BM)
-    return AAResultBase::alias(LocA, LocB);
-
-  // If they may alias, chain to the next AliasAnalysis.
-  if (Aliases(AM, BM))
+  // If accesses may alias, chain to the next AliasAnalysis.
+  if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA))
     return AAResultBase::alias(LocA, LocB);
 
   // Otherwise return a definitive result.
@@ -424,25 +415,24 @@ bool MDNode::isTBAAVtableAccess() const {
   return false;
 }
 
+static bool matchAccessTags(const MDNode *A, const MDNode *B,
+                            const MDNode **GenericTag = nullptr);
+
 MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+  const MDNode *GenericTag;
+  matchAccessTags(A, B, &GenericTag);
+  return const_cast<MDNode*>(GenericTag);
+}
+
+static const MDNode *getLeastCommonType(const MDNode *A, const MDNode *B) {
   if (!A || !B)
     return nullptr;
 
   if (A == B)
     return A;
 
-  // For struct-path aware TBAA, we use the access type of the tag.
-  assert(isStructPathTBAA(A) && isStructPathTBAA(B) &&
-         "Auto upgrade should have taken care of this!");
-  A = cast_or_null<MDNode>(MutableTBAAStructTagNode(A).getAccessType());
-  if (!A)
-    return nullptr;
-  B = cast_or_null<MDNode>(MutableTBAAStructTagNode(B).getAccessType());
-  if (!B)
-    return nullptr;
-
-  SmallSetVector<MDNode *, 4> PathA;
-  MutableTBAANode TA(A);
+  SmallSetVector<const MDNode *, 4> PathA;
+  TBAANode TA(A);
   while (TA.getNode()) {
     if (PathA.count(TA.getNode()))
       report_fatal_error("Cycle found in TBAA metadata.");
@@ -450,8 +440,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     TA = TA.getParent();
   }
 
-  SmallSetVector<MDNode *, 4> PathB;
-  MutableTBAANode TB(B);
+  SmallSetVector<const MDNode *, 4> PathB;
+  TBAANode TB(B);
   while (TB.getNode()) {
     if (PathB.count(TB.getNode()))
       report_fatal_error("Cycle found in TBAA metadata.");
@@ -462,7 +452,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
   int IA = PathA.size() - 1;
   int IB = PathB.size() - 1;
 
-  MDNode *Ret = nullptr;
+  const MDNode *Ret = nullptr;
   while (IA >= 0 && IB >= 0) {
     if (PathA[IA] == PathB[IB])
       Ret = PathA[IA];
@@ -472,17 +462,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     --IB;
   }
 
-  // We either did not find a match, or the only common base "type" is
-  // the root node.  In either case, we don't have any useful TBAA
-  // metadata to attach.
-  if (!Ret || Ret->getNumOperands() < 2)
-    return nullptr;
-
-  // We need to convert from a type node to a tag node.
-  Type *Int64 = IntegerType::get(A->getContext(), 64);
-  Metadata *Ops[3] = {Ret, Ret,
-                      ConstantAsMetadata::get(ConstantInt::get(Int64, 0))};
-  return MDNode::get(A->getContext(), Ops);
+  return Ret;
 }
 
 void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
@@ -505,70 +485,96 @@ void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
     N.NoAlias = getMetadata(LLVMContext::MD_noalias);
 }
 
-/// Aliases - Test whether the type represented by A may alias the
-/// type represented by B.
-bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const {
-  // Verify that both input nodes are struct-path aware.  Auto-upgrade should
-  // have taken care of this.
-  assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware.");
-  assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware.");
+static bool findAccessType(TBAAStructTagNode BaseTag,
+                           const MDNode *AccessTypeNode,
+                           uint64_t &OffsetInBase) {
+  // Start from the base type, follow the edge with the correct offset in
+  // the type DAG and adjust the offset until we reach the access type or
+  // until we reach a root node.
+  TBAAStructTypeNode BaseType(BaseTag.getBaseType());
+  OffsetInBase = BaseTag.getOffset();
+
+  while (const MDNode *BaseTypeNode = BaseType.getNode()) {
+    if (BaseTypeNode == AccessTypeNode)
+      return true;
 
-  // Keep track of the root node for A and B.
-  TBAAStructTypeNode RootA, RootB;
-  TBAAStructTagNode TagA(A), TagB(B);
+    // Follow the edge with the correct offset, Offset will be adjusted to
+    // be relative to the field type.
+    BaseType = BaseType.getParent(OffsetInBase);
+  }
+  return false;
+}
 
-  // TODO: We need to check if AccessType of TagA encloses AccessType of
-  // TagB to support aggregate AccessType. If yes, return true.
+static const MDNode *createAccessTag(const MDNode *AccessType) {
+  // If there is no access type or the access type is the root node, then
+  // we don't have any useful access tag to return.
+  if (!AccessType || AccessType->getNumOperands() < 2)
+    return nullptr;
 
-  // Start from the base type of A, follow the edge with the correct offset in
-  // the type DAG and adjust the offset until we reach the base type of B or
-  // until we reach the Root node.
-  // Compare the adjusted offset once we have the same base.
+  Type *Int64 = IntegerType::get(AccessType->getContext(), 64);
+  auto *ImmutabilityFlag = ConstantAsMetadata::get(ConstantInt::get(Int64, 0));
+  Metadata *Ops[] = {const_cast<MDNode*>(AccessType),
+                     const_cast<MDNode*>(AccessType), ImmutabilityFlag};
+  return MDNode::get(AccessType->getContext(), Ops);
+}
 
-  // Climb the type DAG from base type of A to see if we reach base type of B.
-  const MDNode *BaseA = TagA.getBaseType();
-  const MDNode *BaseB = TagB.getBaseType();
-  uint64_t OffsetA = TagA.getOffset(), OffsetB = TagB.getOffset();
-  for (TBAAStructTypeNode T(BaseA);;) {
-    if (T.getNode() == BaseB)
-      // Base type of A encloses base type of B, check if the offsets match.
-      return OffsetA == OffsetB;
-
-    RootA = T;
-    // Follow the edge with the correct offset, OffsetA will be adjusted to
-    // be relative to the field type.
-    T = T.getParent(OffsetA);
-    if (!T.getNode())
-      break;
+/// matchTags - Return true if the given couple of accesses are allowed to
+/// overlap. If \arg GenericTag is not null, then on return it points to the
+/// most generic access descriptor for the given two.
+static bool matchAccessTags(const MDNode *A, const MDNode *B,
+                            const MDNode **GenericTag) {
+  if (A == B) {
+    if (GenericTag)
+      *GenericTag = A;
+    return true;
   }
 
-  // Reset OffsetA and climb the type DAG from base type of B to see if we reach
-  // base type of A.
-  OffsetA = TagA.getOffset();
-  for (TBAAStructTypeNode T(BaseB);;) {
-    if (T.getNode() == BaseA)
-      // Base type of B encloses base type of A, check if the offsets match.
-      return OffsetA == OffsetB;
-
-    RootB = T;
-    // Follow the edge with the correct offset, OffsetB will be adjusted to
-    // be relative to the field type.
-    T = T.getParent(OffsetB);
-    if (!T.getNode())
-      break;
+  // Accesses with no TBAA information may alias with any other accesses.
+  if (!A || !B) {
+    if (GenericTag)
+      *GenericTag = nullptr;
+    return true;
   }
 
-  // Neither node is an ancestor of the other.
+  // Verify that both input nodes are struct-path aware.  Auto-upgrade should
+  // have taken care of this.
+  assert(isStructPathTBAA(A) && "Access A is not struct-path aware!");
+  assert(isStructPathTBAA(B) && "Access B is not struct-path aware!");
+
+  TBAAStructTagNode TagA(A), TagB(B);
+  const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(),
+                                                TagB.getAccessType());
+  if (GenericTag)
+    *GenericTag = createAccessTag(CommonType);
 
-  // If they have different roots, they're part of different potentially
-  // unrelated type systems, so we must be conservative.
-  if (RootA.getNode() != RootB.getNode())
+  // TODO: We need to check if AccessType of TagA encloses AccessType of
+  // TagB to support aggregate AccessType. If yes, return true.
+
+  // Climb the type DAG from base type of A to see if we reach base type of B.
+  uint64_t OffsetA;
+  if (findAccessType(TagA, TagB.getBaseType(), OffsetA))
+    return OffsetA == TagB.getOffset();
+
+  // Climb the type DAG from base type of B to see if we reach base type of A.
+  uint64_t OffsetB;
+  if (findAccessType(TagB, TagA.getBaseType(), OffsetB))
+    return OffsetB == TagA.getOffset();
+
+  // If the final access types have different roots, they're part of different
+  // potentially unrelated type systems, so we must be conservative.
+  if (!CommonType)
     return true;
 
   // If they have the same root, then we've proved there's no alias.
   return false;
 }
 
+/// Aliases - Test whether the access represented by tag A may alias the
+/// access represented by tag B.
+bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const {
+  return matchAccessTags(A, B);
+}
+
 AnalysisKey TypeBasedAA::Key;
 
 TypeBasedAAResult TypeBasedAA::run(Function &F, FunctionAnalysisManager &AM) {
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 3f53fc517e3..2010858139a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2579,9 +2579,7 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
   case LibFunc_sqrt:
   case LibFunc_sqrtf:
   case LibFunc_sqrtl:
-    if (ICS->hasNoNaNs())
-      return Intrinsic::sqrt;
-    return Intrinsic::not_intrinsic;
+    return Intrinsic::sqrt;
   }
 
   return Intrinsic::not_intrinsic;
@@ -4140,7 +4138,8 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
     // Is the sign bit set?
     // (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
     // (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
-    if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
+    if (Pred == CmpInst::ICMP_SLT && C1->isNullValue() &&
+        C2->isMaxSignedValue())
       return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
 
     // Is the sign bit clear?
@@ -4272,13 +4271,15 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
 
       // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
       // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
-      if (Pred == ICmpInst::ICMP_SGT && (*C1 == 0 || C1->isAllOnesValue())) {
+      if (Pred == ICmpInst::ICMP_SGT &&
+          (C1->isNullValue() || C1->isAllOnesValue())) {
         return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
       }
 
       // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
       // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
-      if (Pred == ICmpInst::ICMP_SLT && (*C1 == 0 || *C1 == 1)) {
+      if (Pred == ICmpInst::ICMP_SLT &&
+          (C1->isNullValue() || C1->isOneValue())) {
         return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
       }
     }
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 50b391fdf73..b8b56d79c82 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -552,6 +552,8 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(nsz);
   KEYWORD(arcp);
   KEYWORD(contract);
+  KEYWORD(reassoc);
+  KEYWORD(afn);
   KEYWORD(fast);
   KEYWORD(nuw);
   KEYWORD(nsw);
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index dcc3f22e03b..94e4c1ae96d 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -193,7 +193,7 @@ namespace llvm {
       FastMathFlags FMF;
       while (true)
         switch (Lex.getKind()) {
-        case lltok::kw_fast: FMF.setUnsafeAlgebra();   Lex.Lex(); continue;
+        case lltok::kw_fast: FMF.setFast();            Lex.Lex(); continue;
         case lltok::kw_nnan: FMF.setNoNaNs();          Lex.Lex(); continue;
         case lltok::kw_ninf: FMF.setNoInfs();          Lex.Lex(); continue;
         case lltok::kw_nsz:  FMF.setNoSignedZeros();   Lex.Lex(); continue;
@@ -202,6 +202,8 @@ namespace llvm {
           FMF.setAllowContract(true);
           Lex.Lex();
           continue;
+        case lltok::kw_reassoc: FMF.setAllowReassoc(); Lex.Lex(); continue;
+        case lltok::kw_afn:     FMF.setApproxFunc();   Lex.Lex(); continue;
         default: return FMF;
         }
       return FMF;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index db0de6c0d5a..0c5cf6b5d45 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -102,6 +102,8 @@ enum Kind {
   kw_nsz,
   kw_arcp,
   kw_contract,
+  kw_reassoc,
+  kw_afn,
   kw_fast,
   kw_nuw,
   kw_nsw,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index c2272260f44..3e0a39c099b 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -889,7 +889,9 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
   // to work correctly on earlier versions, we must conservatively treat all
   // values as live.
   bool Live = (RawFlags & 0x2) || Version < 3;
-  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live);
+  bool Local = (RawFlags & 0x4);
+
+  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local);
 }
 
 static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
@@ -1044,8 +1046,8 @@ static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
 
 static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
   FastMathFlags FMF;
-  if (0 != (Val & FastMathFlags::UnsafeAlgebra))
-    FMF.setUnsafeAlgebra();
+  if (0 != (Val & FastMathFlags::AllowReassoc))
+    FMF.setAllowReassoc();
   if (0 != (Val & FastMathFlags::NoNaNs))
     FMF.setNoNaNs();
   if (0 != (Val & FastMathFlags::NoInfs))
@@ -1056,6 +1058,8 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
     FMF.setAllowReciprocal();
   if (0 != (Val & FastMathFlags::AllowContract))
     FMF.setAllowContract(true);
+  if (0 != (Val & FastMathFlags::ApproxFunc))
+    FMF.setApproxFunc();
   return FMF;
 }
 
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1e491aa066e..03a77c9734e 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -955,6 +955,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
 
   RawFlags |= Flags.NotEligibleToImport; // bool
   RawFlags |= (Flags.Live << 1);
+  RawFlags |= (Flags.DSOLocal << 2);
+
   // Linkage don't need to be remapped at that time for the summary. Any future
   // change to the getEncodedLinkage() function will need to be taken into
   // account here as well.
@@ -1319,8 +1321,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
     if (PEO->isExact())
       Flags |= 1 << bitc::PEO_EXACT;
   } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
-    if (FPMO->hasUnsafeAlgebra())
-      Flags |= FastMathFlags::UnsafeAlgebra;
+    if (FPMO->hasAllowReassoc())
+      Flags |= FastMathFlags::AllowReassoc;
     if (FPMO->hasNoNaNs())
       Flags |= FastMathFlags::NoNaNs;
     if (FPMO->hasNoInfs())
@@ -1331,6 +1333,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
       Flags |= FastMathFlags::AllowReciprocal;
     if (FPMO->hasAllowContract())
       Flags |= FastMathFlags::AllowContract;
+    if (FPMO->hasApproxFunc())
+      Flags |= FastMathFlags::ApproxFunc;
   }
 
   return Flags;
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 730187087dc..011356c3260 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -18,6 +18,8 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/UniqueVector.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/UseListOrder.h"
 #include <cassert>
 #include <cstdint>
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index d7f91fc1ce3..1dea746a6ac 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -28,12 +28,12 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 876cca4bc7a..9642368a047 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -24,7 +25,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 8b1376ab363..973816d5635 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -29,7 +29,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index a35fcdaaf9a..4ebc7176943 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -51,6 +51,8 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Comdat.h"
 #include "llvm/IR/Constant.h"
@@ -100,8 +102,6 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index eae79ad101d..5250f1b1787 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
@@ -32,7 +33,6 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 67bab8c7684..5aa3f4ae103 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -68,7 +68,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/SMLoc.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index dd7f7931b06..1a6cb967992 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -31,7 +31,7 @@
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 06b5b06c41b..603d0f7f470 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -36,7 +36,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 6e4625ba411..167ca13c19c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include <memory>
 #include <utility>
@@ -27,7 +28,6 @@ class DwarfCompileUnit;
 class DwarfUnit;
 class LexicalScope;
 class MCSection;
-class MDNode;
 
 class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 5d485f21357..35ce1fec385 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -33,7 +33,7 @@
 #include "llvm/MC/MCWin64EH.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 40cb0c0cdf1..cd6056b674c 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -39,6 +39,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
@@ -52,7 +53,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -296,6 +296,11 @@ static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
   return HashMachineInstr(*I);
 }
 
+///  Whether MI should be counted as an instruction when calculating common tail.
+static bool countsAsInstruction(const MachineInstr &MI) {
+  return !(MI.isDebugValue() || MI.isCFIInstruction());
+}
+
 /// ComputeCommonTailLength - Given two machine basic blocks, compute the number
 /// of instructions they actually have in common together at their end.  Return
 /// iterators for the first shared instruction in each block.
@@ -310,26 +315,27 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
   while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
     --I1; --I2;
     // Skip debugging pseudos; necessary to avoid changing the code.
-    while (I1->isDebugValue()) {
+    while (!countsAsInstruction(*I1)) {
       if (I1==MBB1->begin()) {
-        while (I2->isDebugValue()) {
-          if (I2==MBB2->begin())
+        while (!countsAsInstruction(*I2)) {
+          if (I2==MBB2->begin()) {
             // I1==DBG at begin; I2==DBG at begin
-            return TailLen;
+            goto SkipTopCFIAndReturn;
+          }
           --I2;
         }
         ++I2;
         // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
-        return TailLen;
+        goto SkipTopCFIAndReturn;
       }
       --I1;
     }
     // I1==first (untested) non-DBG preceding known match
-    while (I2->isDebugValue()) {
+    while (!countsAsInstruction(*I2)) {
       if (I2==MBB2->begin()) {
         ++I1;
         // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
-        return TailLen;
+        goto SkipTopCFIAndReturn;
       }
       --I2;
     }
@@ -368,6 +374,37 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
     }
     ++I1;
   }
+
+SkipTopCFIAndReturn:
+  // Ensure that I1 and I2 do not point to a CFI_INSTRUCTION. This can happen if
+  // I1 and I2 are non-identical when compared and then one or both of them ends
+  // up pointing to a CFI instruction after being incremented. For example:
+  /*
+    BB1:
+    ...
+    INSTRUCTION_A
+    ADD32ri8  <- last common instruction
+    ...
+    BB2:
+    ...
+    INSTRUCTION_B
+    CFI_INSTRUCTION
+    ADD32ri8  <- last common instruction
+    ...
+  */
+  // When INSTRUCTION_A and INSTRUCTION_B are compared as not equal, after
+  // incrementing the iterators, I1 will point to ADD, however I2 will point to
+  // the CFI instruction. Later on, this leads to BB2 being 'hacked off' at the
+  // wrong place (in ReplaceTailWithBranchTo()) which results in losing this CFI
+  // instruction.
+  while (I1 != MBB1->end() && I1->isCFIInstruction()) {
+    ++I1;
+  }
+
+  while (I2 != MBB2->end() && I2->isCFIInstruction()) {
+    ++I2;
+  }
+
   return TailLen;
 }
 
@@ -454,7 +491,7 @@ static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
                                 MachineBasicBlock::iterator E) {
   unsigned Time = 0;
   for (; I != E; ++I) {
-    if (I->isDebugValue())
+    if (!countsAsInstruction(*I))
       continue;
     if (I->isCall())
       Time += 10;
@@ -814,12 +851,12 @@ mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
     assert(MBBI != MBBIE && "Reached BB end within common tail length!");
     (void)MBBIE;
 
-    if (MBBI->isDebugValue()) {
+    if (!countsAsInstruction(*MBBI)) {
       ++MBBI;
       continue;
     }
 
-    while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
+    while ((MBBICommon != MBBIECommon) && !countsAsInstruction(*MBBICommon))
       ++MBBICommon;
 
     assert(MBBICommon != MBBIECommon &&
@@ -859,7 +896,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
   }
 
   for (auto &MI : *MBB) {
-    if (MI.isDebugValue())
+    if (!countsAsInstruction(MI))
       continue;
     DebugLoc DL = MI.getDebugLoc();
     for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) {
@@ -869,7 +906,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
       auto &Pos = NextCommonInsts[i];
       assert(Pos != SameTails[i].getBlock()->end() &&
           "Reached BB end within common tail");
-      while (Pos->isDebugValue()) {
+      while (!countsAsInstruction(*Pos)) {
         ++Pos;
         assert(Pos != SameTails[i].getBlock()->end() &&
             "Reached BB end within common tail");
diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp
index 2d21fbeea39..73b399e4444 100644
--- a/lib/CodeGen/BranchRelaxation.cpp
+++ b/lib/CodeGen/BranchRelaxation.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
@@ -22,7 +23,6 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/CFIInstrInserter.cpp b/lib/CodeGen/CFIInstrInserter.cpp
new file mode 100644
index 00000000000..5464ee443e0
--- /dev/null
+++ b/lib/CodeGen/CFIInstrInserter.cpp
@@ -0,0 +1,319 @@
+//===------ CFIInstrInserter.cpp - Insert additional CFI instructions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass verifies incoming and outgoing CFA information of basic
+/// blocks. CFA information is information about offset and register set by CFI
+/// directives, valid at the start and end of a basic block. This pass checks
+/// that outgoing information of predecessors matches incoming information of
+/// their successors. Then it checks if blocks have correct CFA calculation rule
+/// set and inserts additional CFI instruction at their beginnings if they
+/// don't. CFI instructions are inserted if basic blocks have incorrect offset
+/// or register set by previous blocks, as a result of a non-linear layout of
+/// blocks in a function.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+namespace {
+class CFIInstrInserter : public MachineFunctionPass {
+ public:
+  static char ID;
+
+  CFIInstrInserter() : MachineFunctionPass(ID) {
+    initializeCFIInstrInserterPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+
+    if (!MF.getMMI().hasDebugInfo() &&
+        !MF.getFunction()->needsUnwindTableEntry())
+      return false;
+
+    MBBVector.resize(MF.getNumBlockIDs());
+    calculateCFAInfo(MF);
+#ifndef NDEBUG
+    unsigned ErrorNum = verify(MF);
+    if (ErrorNum)
+      report_fatal_error("Found " + Twine(ErrorNum) +
+                         " in/out CFI information errors.");
+#endif
+    bool insertedCFI = insertCFIInstrs(MF);
+    MBBVector.clear();
+    return insertedCFI;
+  }
+
+ private:
+  struct MBBCFAInfo {
+    MachineBasicBlock *MBB;
+    /// Value of cfa offset valid at basic block entry.
+    int IncomingCFAOffset = -1;
+    /// Value of cfa offset valid at basic block exit.
+    int OutgoingCFAOffset = -1;
+    /// Value of cfa register valid at basic block entry.
+    unsigned IncomingCFARegister = 0;
+    /// Value of cfa register valid at basic block exit.
+    unsigned OutgoingCFARegister = 0;
+    /// If in/out cfa offset and register values for this block have already
+    /// been set or not.
+    bool Processed = false;
+  };
+
+  /// Contains cfa offset and register values valid at entry and exit of basic
+  /// blocks.
+  SmallVector<struct MBBCFAInfo, 4> MBBVector;
+
+  /// Calculate cfa offset and register values valid at entry and exit for all
+  /// basic blocks in a function.
+  void calculateCFAInfo(MachineFunction &MF);
+  /// Calculate cfa offset and register values valid at basic block exit by
+  /// checking the block for CFI instructions. Block's incoming CFA info remains
+  /// the same.
+  void calculateOutgoingCFAInfo(struct MBBCFAInfo &MBBInfo);
+  /// Update in/out cfa offset and register values for successors of the basic
+  /// block.
+  void updateSuccCFAInfo(struct MBBCFAInfo &MBBInfo);
+
+  /// Check if incoming CFA information of a basic block matches outgoing CFA
+  /// information of the previous block. If it doesn't, insert CFI instruction
+  /// at the beginning of the block that corrects the CFA calculation rule for
+  /// that block.
+  bool insertCFIInstrs(MachineFunction &MF);
+  /// Return the cfa offset value that should be set at the beginning of a MBB
+  /// if needed. The negated value is needed when creating CFI instructions that
+  /// set absolute offset.
+  int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+    return -MBBVector[MBB->getNumber()].IncomingCFAOffset;
+  }
+
+  void report(const char *msg, MachineBasicBlock &MBB);
+  /// Go through each MBB in a function and check that outgoing offset and
+  /// register of its predecessors match incoming offset and register of that
+  /// MBB, as well as that incoming offset and register of its successors match
+  /// outgoing offset and register of the MBB.
+  unsigned verify(MachineFunction &MF);
+};
+}
+
+char CFIInstrInserter::ID = 0;
+INITIALIZE_PASS(CFIInstrInserter, "cfi-instr-inserter",
+                "Check CFA info and insert CFI instructions if needed", false,
+                false)
+FunctionPass *llvm::createCFIInstrInserter() { return new CFIInstrInserter(); }
+
+void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
+  // Initial CFA offset value i.e. the one valid at the beginning of the
+  // function.
+  int InitialOffset =
+      MF.getSubtarget().getFrameLowering()->getInitialCFAOffset(MF);
+  // Initial CFA register value i.e. the one valid at the beginning of the
+  // function.
+  unsigned InitialRegister =
+      MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF);
+
+  // Initialize MBBMap.
+  for (MachineBasicBlock &MBB : MF) {
+    struct MBBCFAInfo MBBInfo;
+    MBBInfo.MBB = &MBB;
+    MBBInfo.IncomingCFAOffset = InitialOffset;
+    MBBInfo.OutgoingCFAOffset = InitialOffset;
+    MBBInfo.IncomingCFARegister = InitialRegister;
+    MBBInfo.OutgoingCFARegister = InitialRegister;
+    MBBVector[MBB.getNumber()] = MBBInfo;
+  }
+
+  // Set in/out cfa info for all blocks in the function. This traversal is based
+  // on the assumption that the first block in the function is the entry block
+  // i.e. that it has initial cfa offset and register values as incoming CFA
+  // information.
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBVector[MBB.getNumber()].Processed) continue;
+    calculateOutgoingCFAInfo(MBBVector[MBB.getNumber()]);
+    updateSuccCFAInfo(MBBVector[MBB.getNumber()]);
+  }
+}
+
+void CFIInstrInserter::calculateOutgoingCFAInfo(struct MBBCFAInfo &MBBInfo) {
+  // Outgoing cfa offset set by the block.
+  int SetOffset = MBBInfo.IncomingCFAOffset;
+  // Outgoing cfa register set by the block.
+  unsigned SetRegister = MBBInfo.IncomingCFARegister;
+  const std::vector<MCCFIInstruction> &Instrs =
+      MBBInfo.MBB->getParent()->getFrameInstructions();
+
+  // Determine cfa offset and register set by the block.
+  for (MachineInstr &MI :
+       make_range(MBBInfo.MBB->instr_begin(), MBBInfo.MBB->instr_end())) {
+    if (MI.isCFIInstruction()) {
+      unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+      const MCCFIInstruction &CFI = Instrs[CFIIndex];
+      if (CFI.getOperation() == MCCFIInstruction::OpDefCfaRegister) {
+        SetRegister = CFI.getRegister();
+      } else if (CFI.getOperation() == MCCFIInstruction::OpDefCfaOffset) {
+        SetOffset = CFI.getOffset();
+      } else if (CFI.getOperation() == MCCFIInstruction::OpAdjustCfaOffset) {
+        SetOffset += CFI.getOffset();
+      } else if (CFI.getOperation() == MCCFIInstruction::OpDefCfa) {
+        SetRegister = CFI.getRegister();
+        SetOffset = CFI.getOffset();
+      }
+    }
+  }
+
+  MBBInfo.Processed = true;
+
+  // Update outgoing CFA info.
+  MBBInfo.OutgoingCFAOffset = SetOffset;
+  MBBInfo.OutgoingCFARegister = SetRegister;
+}
+
+void CFIInstrInserter::updateSuccCFAInfo(struct MBBCFAInfo &MBBInfo) {
+
+  for (MachineBasicBlock *Succ : MBBInfo.MBB->successors()) {
+    struct MBBCFAInfo &SuccInfo = MBBVector[Succ->getNumber()];
+    if (SuccInfo.Processed) continue;
+    SuccInfo.IncomingCFAOffset = MBBInfo.OutgoingCFAOffset;
+    SuccInfo.IncomingCFARegister = MBBInfo.OutgoingCFARegister;
+    calculateOutgoingCFAInfo(SuccInfo);
+    updateSuccCFAInfo(SuccInfo);
+  }
+}
+
+bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
+
+  const struct MBBCFAInfo *PrevMBBInfo = &MBBVector[MF.front().getNumber()];
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  bool InsertedCFIInstr = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Skip the first MBB in a function
+    if (MBB.getNumber() == MF.front().getNumber()) continue;
+
+    const struct MBBCFAInfo& MBBInfo = MBBVector[MBB.getNumber()];
+    auto MBBI = MBBInfo.MBB->begin();
+    DebugLoc DL = MBBInfo.MBB->findDebugLoc(MBBI);
+
+    if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+      // If both outgoing offset and register of a previous block don't match
+      // incoming offset and register of this block, add a def_cfa instruction
+      // with the correct offset and register for this block.
+      if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
+        unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+            nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
+        BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+        // If outgoing offset of a previous block doesn't match incoming offset
+        // of this block, add a def_cfa_offset instruction with the correct
+        // offset for this block.
+      } else {
+        unsigned CFIIndex =
+            MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+                nullptr, getCorrectCFAOffset(&MBB)));
+        BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+      InsertedCFIInstr = true;
+      // If outgoing register of a previous block doesn't match incoming
+      // register of this block, add a def_cfa_register instruction with the
+      // correct register for this block.
+    } else if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MBBInfo.IncomingCFARegister));
+      BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+      InsertedCFIInstr = true;
+    }
+    PrevMBBInfo = &MBBInfo;
+  }
+  return InsertedCFIInstr;
+}
+
+void CFIInstrInserter::report(const char *msg, MachineBasicBlock &MBB) {
+  errs() << '\n';
+  errs() << "*** " << msg << " ***\n"
+         << "- function:    " << MBB.getParent()->getName() << "\n";
+  errs() << "- basic block: BB#" << MBB.getNumber() << ' ' << MBB.getName()
+         << " (" << (const void *)&MBB << ')';
+  errs() << '\n';
+}
+
+unsigned CFIInstrInserter::verify(MachineFunction &MF) {
+  unsigned ErrorNum = 0;
+  for (MachineBasicBlock &CurrMBB : MF) {
+    const struct MBBCFAInfo& CurrMBBInfo = MBBVector[CurrMBB.getNumber()];
+    for (MachineBasicBlock *Pred : CurrMBB.predecessors()) {
+      const struct MBBCFAInfo& PredMBBInfo = MBBVector[Pred->getNumber()];
+      // Check that outgoing offset values of predecessors match the incoming
+      // offset value of CurrMBB
+      if (PredMBBInfo.OutgoingCFAOffset != CurrMBBInfo.IncomingCFAOffset) {
+        report("The outgoing offset of a predecessor is inconsistent.",
+               CurrMBB);
+        errs() << "Predecessor BB#" << Pred->getNumber()
+               << " has outgoing offset (" << PredMBBInfo.OutgoingCFAOffset
+               << "), while BB#" << CurrMBB.getNumber()
+               << " has incoming offset (" << CurrMBBInfo.IncomingCFAOffset
+               << ").\n";
+        ErrorNum++;
+      }
+      // Check that outgoing register values of predecessors match the incoming
+      // register value of CurrMBB
+      if (PredMBBInfo.OutgoingCFARegister != CurrMBBInfo.IncomingCFARegister) {
+        report("The outgoing register of a predecessor is inconsistent.",
+               CurrMBB);
+        errs() << "Predecessor BB#" << Pred->getNumber()
+               << " has outgoing register (" << PredMBBInfo.OutgoingCFARegister
+               << "), while BB#" << CurrMBB.getNumber()
+               << " has incoming register (" << CurrMBBInfo.IncomingCFARegister
+               << ").\n";
+        ErrorNum++;
+      }
+    }
+
+    for (MachineBasicBlock *Succ : CurrMBB.successors()) {
+      const struct MBBCFAInfo& SuccMBBInfo = MBBVector[Succ->getNumber()];
+      // Check that incoming offset values of successors match the outgoing
+      // offset value of CurrMBB
+      if (SuccMBBInfo.IncomingCFAOffset != CurrMBBInfo.OutgoingCFAOffset) {
+        report("The incoming offset of a successor is inconsistent.", CurrMBB);
+        errs() << "Successor BB#" << Succ->getNumber()
+               << " has incoming offset (" << SuccMBBInfo.IncomingCFAOffset
+               << "), while BB#" << CurrMBB.getNumber()
+               << " has outgoing offset (" << CurrMBBInfo.OutgoingCFAOffset
+               << ").\n";
+        ErrorNum++;
+      }
+      // Check that incoming register values of successors match the outgoing
+      // register value of CurrMBB
+      if (SuccMBBInfo.IncomingCFARegister != CurrMBBInfo.OutgoingCFARegister) {
+        report("The incoming register of a successor is inconsistent.",
+               CurrMBB);
+        errs() << "Successor BB#" << Succ->getNumber()
+               << " has incoming register (" << SuccMBBInfo.IncomingCFARegister
+               << "), while BB#" << CurrMBB.getNumber()
+               << " has outgoing register (" << CurrMBBInfo.OutgoingCFARegister
+               << ").\n";
+        ErrorNum++;
+      }
+    }
+  }
+  return ErrorNum;
+}
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 7ec7fda4e44..1ae5aa80bf9 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMCodeGen
   BuiltinGCs.cpp
   CalcSpillWeights.cpp
   CallingConvLower.cpp
+  CFIInstrInserter.cpp
   CodeGen.cpp
   CodeGenPrepare.cpp
   CountingFunctionInserter.cpp
@@ -21,6 +22,7 @@ add_llvm_library(LLVMCodeGen
   EdgeBundles.cpp
   ExecutionDepsFix.cpp
   ExpandISelPseudos.cpp
+  ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
   FaultMaps.cpp
@@ -113,6 +115,7 @@ add_llvm_library(LLVMCodeGen
   RegisterPressure.cpp
   RegisterScavenging.cpp
   RenameIndependentSubregs.cpp
+  MIRCanonicalizerPass.cpp
   RegisterUsageInfo.cpp
   RegUsageInfoCollector.cpp
   RegUsageInfoPropagate.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 588f1791ce3..d4ac5fd040c 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -16,10 +16,10 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index f4ccb4889d3..9d10d1b75f5 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeAtomicExpandPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeBranchRelaxationPass(Registry);
+  initializeCFIInstrInserterPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeCountingFunctionInserterPass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
@@ -30,6 +31,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeDwarfEHPreparePass(Registry);
   initializeEarlyIfConverterPass(Registry);
   initializeExpandISelPseudosPass(Registry);
+  initializeExpandMemCmpPassPass(Registry);
   initializeExpandPostRAPass(Registry);
   initializeFEntryInserterPass(Registry);
   initializeFinalizeMachineBundlesPass(Registry);
@@ -99,6 +101,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeVirtRegRewriterPass(Registry);
   initializeWinEHPreparePass(Registry);
   initializeXRayInstrumentationPass(Registry);
+  initializeMIRCanonicalizerPass(Registry);
 }
 
 void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 51f2a320b29..d6633a508f5 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -113,6 +113,12 @@ STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
                        "of sunken Casts");
 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
                           "computations were sunk");
+STATISTIC(NumMemoryInstsPhiCreated,
+          "Number of phis created when address "
+          "computations were sunk to memory instructions");
+STATISTIC(NumMemoryInstsSelectCreated,
+          "Number of select created when address "
+          "computations were sunk to memory instructions");
 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
 STATISTIC(NumAndsAdded,
@@ -123,12 +129,6 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
-STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
-STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
-STATISTIC(NumMemCmpGreaterThanMax,
-          "Number of memcmp calls with size greater than max size");
-STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
-
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -189,10 +189,18 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
     cl::desc("Enable merging of redundant sexts when one is dominating"
     " the other."), cl::init(true));
 
-static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
-    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
-    cl::desc("The number of loads per basic block for inline expansion of "
-             "memcmp that is only being compared against zero."));
+static cl::opt<bool> DisableComplexAddrModes(
+    "disable-complex-addr-modes", cl::Hidden, cl::init(true),
+    cl::desc("Disables combining addressing modes with different parts "
+             "in optimizeMemoryInst."));
+
+static cl::opt<bool>
+AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
+                cl::desc("Allow creation of Phis in Address sinking."));
+
+static cl::opt<bool>
+AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(false),
+                   cl::desc("Allow creation of selects in Address sinking."));
 
 namespace {
 
@@ -1182,6 +1190,7 @@ static bool SinkCast(CastInst *CI) {
 
   // If we removed all uses, nuke the cast.
   if (CI->use_empty()) {
+    salvageDebugInfo(*CI);
     CI->eraseFromParent();
     MadeChange = true;
   }
@@ -1697,699 +1706,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   return true;
 }
 
-namespace {
-
-// This class provides helper functions to expand a memcmp library call into an
-// inline expansion.
-class MemCmpExpansion {
-  struct ResultBlock {
-    BasicBlock *BB = nullptr;
-    PHINode *PhiSrc1 = nullptr;
-    PHINode *PhiSrc2 = nullptr;
-
-    ResultBlock() = default;
-  };
-
-  CallInst *const CI;
-  ResultBlock ResBlock;
-  const uint64_t Size;
-  unsigned MaxLoadSize;
-  uint64_t NumLoadsNonOneByte;
-  const uint64_t NumLoadsPerBlock;
-  std::vector<BasicBlock *> LoadCmpBlocks;
-  BasicBlock *EndBlock;
-  PHINode *PhiRes;
-  const bool IsUsedForZeroCmp;
-  const DataLayout &DL;
-  IRBuilder<> Builder;
-  // Represents the decomposition in blocks of the expansion. For example,
-  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
-  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}.
-  // TODO(courbet): Involve the target more in this computation. On X86, 7
-  // bytes can be done more efficiently with two overlaping 4-byte loads than
-  // covering the interval with [{4, 0},{2, 4},{1, 6}}.
-  struct LoadEntry {
-    LoadEntry(unsigned LoadSize, uint64_t Offset)
-        : LoadSize(LoadSize), Offset(Offset) {
-      assert(Offset % LoadSize == 0 && "invalid load entry");
-    }
-
-    uint64_t getGEPIndex() const { return Offset / LoadSize; }
-
-    // The size of the load for this block, in bytes.
-    const unsigned LoadSize;
-    // The offset of this load WRT the base pointer, in bytes.
-    const uint64_t Offset;
-  };
-  SmallVector<LoadEntry, 8> LoadSequence;
-
-  void createLoadCmpBlocks();
-  void createResultBlock();
-  void setupResultBlockPHINodes();
-  void setupEndBlockPHINodes();
-  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
-  void emitLoadCompareBlock(unsigned BlockIndex);
-  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
-                                         unsigned &LoadIndex);
-  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
-  void emitMemCmpResultBlock();
-  Value *getMemCmpExpansionZeroCase();
-  Value *getMemCmpEqZeroOneBlock();
-  Value *getMemCmpOneBlock();
-
- public:
-  MemCmpExpansion(CallInst *CI, uint64_t Size,
-                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
-                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
-                  unsigned NumLoadsPerBlock, const DataLayout &DL);
-
-  unsigned getNumBlocks();
-  uint64_t getNumLoads() const { return LoadSequence.size(); }
-
-  Value *getMemCmpExpansion();
-};
-
-} // end anonymous namespace
-
-// Initialize the basic block structure required for expansion of memcmp call
-// with given maximum load size and memcmp size parameter.
-// This structure includes:
-// 1. A list of load compare blocks - LoadCmpBlocks.
-// 2. An EndBlock, split from original instruction point, which is the block to
-// return from.
-// 3. ResultBlock, block to branch to for early exit when a
-// LoadCmpBlock finds a difference.
-MemCmpExpansion::MemCmpExpansion(
-    CallInst *const CI, uint64_t Size,
-    const TargetTransformInfo::MemCmpExpansionOptions &Options,
-    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
-    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
-    : CI(CI),
-      Size(Size),
-      MaxLoadSize(0),
-      NumLoadsNonOneByte(0),
-      NumLoadsPerBlock(NumLoadsPerBlock),
-      IsUsedForZeroCmp(IsUsedForZeroCmp),
-      DL(TheDataLayout),
-      Builder(CI) {
-  assert(Size > 0 && "zero blocks");
-  // Scale the max size down if the target can load more bytes than we need.
-  size_t LoadSizeIndex = 0;
-  while (LoadSizeIndex < Options.LoadSizes.size() &&
-         Options.LoadSizes[LoadSizeIndex] > Size) {
-    ++LoadSizeIndex;
-  }
-  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
-  // Compute the decomposition.
-  uint64_t CurSize = Size;
-  uint64_t Offset = 0;
-  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
-    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
-    assert(LoadSize > 0 && "zero load size");
-    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
-    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
-      // Do not expand if the total number of loads is larger than what the
-      // target allows. Note that it's important that we exit before completing
-      // the expansion to avoid using a ton of memory to store the expansion for
-      // large sizes.
-      LoadSequence.clear();
-      return;
-    }
-    if (NumLoadsForThisSize > 0) {
-      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
-        LoadSequence.push_back({LoadSize, Offset});
-        Offset += LoadSize;
-      }
-      if (LoadSize > 1) {
-        ++NumLoadsNonOneByte;
-      }
-      CurSize = CurSize % LoadSize;
-    }
-    ++LoadSizeIndex;
-  }
-  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
-}
-
-unsigned MemCmpExpansion::getNumBlocks() {
-  if (IsUsedForZeroCmp)
-    return getNumLoads() / NumLoadsPerBlock +
-           (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
-  return getNumLoads();
-}
-
-void MemCmpExpansion::createLoadCmpBlocks() {
-  for (unsigned i = 0; i < getNumBlocks(); i++) {
-    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
-                                        EndBlock->getParent(), EndBlock);
-    LoadCmpBlocks.push_back(BB);
-  }
-}
-
-void MemCmpExpansion::createResultBlock() {
-  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
-                                   EndBlock->getParent(), EndBlock);
-}
-
-// This function creates the IR instructions for loading and comparing 1 byte.
-// It loads 1 byte from each source of the memcmp parameters with the given
-// GEPIndex. It then subtracts the two loaded values and adds this result to the
-// final phi node for selecting the memcmp result.
-void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
-                                               unsigned GEPIndex) {
-  Value *Source1 = CI->getArgOperand(0);
-  Value *Source2 = CI->getArgOperand(1);
-
-  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
-  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
-  // Cast source to LoadSizeType*.
-  if (Source1->getType() != LoadSizeType)
-    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
-  if (Source2->getType() != LoadSizeType)
-    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
-  // Get the base address using the GEPIndex.
-  if (GEPIndex != 0) {
-    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
-                                ConstantInt::get(LoadSizeType, GEPIndex));
-    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
-                                ConstantInt::get(LoadSizeType, GEPIndex));
-  }
-
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
-  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
-  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
-
-  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
-
-  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
-    // Early exit branch if difference found to EndBlock. Otherwise, continue to
-    // next LoadCmpBlock,
-    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
-                                    ConstantInt::get(Diff->getType(), 0));
-    BranchInst *CmpBr =
-        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
-    Builder.Insert(CmpBr);
-  } else {
-    // The last block has an unconditional branch to EndBlock.
-    BranchInst *CmpBr = BranchInst::Create(EndBlock);
-    Builder.Insert(CmpBr);
-  }
-}
-
-/// Generate an equality comparison for one or more pairs of loaded values.
-/// This is used in the case where the memcmp() call is compared equal or not
-/// equal to zero.
-Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
-                                            unsigned &LoadIndex) {
-  assert(LoadIndex < getNumLoads() &&
-         "getCompareLoadPairs() called with no remaining loads");
-  std::vector<Value *> XorList, OrList;
-  Value *Diff;
-
-  const unsigned NumLoads =
-      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
-
-  // For a single-block expansion, start inserting before the memcmp call.
-  if (LoadCmpBlocks.empty())
-    Builder.SetInsertPoint(CI);
-  else
-    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
-
-  Value *Cmp = nullptr;
-  // If we have multiple loads per block, we need to generate a composite
-  // comparison using xor+or. The type for the combinations is the largest load
-  // type.
-  IntegerType *const MaxLoadType =
-      NumLoads == 1 ? nullptr
-                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
-  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
-    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
-
-    IntegerType *LoadSizeType =
-        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
-
-    Value *Source1 = CI->getArgOperand(0);
-    Value *Source2 = CI->getArgOperand(1);
-
-    // Cast source to LoadSizeType*.
-    if (Source1->getType() != LoadSizeType)
-      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
-    if (Source2->getType() != LoadSizeType)
-      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
-    // Get the base address using a GEP.
-    if (CurLoadEntry.Offset != 0) {
-      Source1 = Builder.CreateGEP(
-          LoadSizeType, Source1,
-          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
-      Source2 = Builder.CreateGEP(
-          LoadSizeType, Source2,
-          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
-    }
-
-    // Get a constant or load a value for each source address.
-    Value *LoadSrc1 = nullptr;
-    if (auto *Source1C = dyn_cast<Constant>(Source1))
-      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
-    if (!LoadSrc1)
-      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-
-    Value *LoadSrc2 = nullptr;
-    if (auto *Source2C = dyn_cast<Constant>(Source2))
-      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
-    if (!LoadSrc2)
-      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-    if (NumLoads != 1) {
-      if (LoadSizeType != MaxLoadType) {
-        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
-        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
-      }
-      // If we have multiple loads per block, we need to generate a composite
-      // comparison using xor+or.
-      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
-      Diff = Builder.CreateZExt(Diff, MaxLoadType);
-      XorList.push_back(Diff);
-    } else {
-      // If there's only one load per block, we just compare the loaded values.
-      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
-    }
-  }
-
-  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
-    std::vector<Value *> OutList;
-    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
-      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
-      OutList.push_back(Or);
-    }
-    if (InList.size() % 2 != 0)
-      OutList.push_back(InList.back());
-    return OutList;
-  };
-
-  if (!Cmp) {
-    // Pairwise OR the XOR results.
-    OrList = pairWiseOr(XorList);
-
-    // Pairwise OR the OR results until one result left.
-    while (OrList.size() != 1) {
-      OrList = pairWiseOr(OrList);
-    }
-    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
-  }
-
-  return Cmp;
-}
-
-void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
-                                                        unsigned &LoadIndex) {
-  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
-
-  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
-                           ? EndBlock
-                           : LoadCmpBlocks[BlockIndex + 1];
-  // Early exit branch if difference found to ResultBlock. Otherwise,
-  // continue to next LoadCmpBlock or EndBlock.
-  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
-  Builder.Insert(CmpBr);
-
-  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
-  // since early exit to ResultBlock was not taken (no difference was found in
-  // any of the bytes).
-  if (BlockIndex == LoadCmpBlocks.size() - 1) {
-    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
-    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
-  }
-}
-
-// This function creates the IR intructions for loading and comparing using the
-// given LoadSize. It loads the number of bytes specified by LoadSize from each
-// source of the memcmp parameters. It then does a subtract to see if there was
-// a difference in the loaded values. If a difference is found, it branches
-// with an early exit to the ResultBlock for calculating which source was
-// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
-// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
-// a special case through emitLoadCompareByteBlock. The special handling can
-// simply subtract the loaded values and add it to the result phi node.
-void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
-  // There is one load per block in this case, BlockIndex == LoadIndex.
-  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
-
-  if (CurLoadEntry.LoadSize == 1) {
-    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
-                                              CurLoadEntry.getGEPIndex());
-    return;
-  }
-
-  Type *LoadSizeType =
-      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
-  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
-  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
-
-  Value *Source1 = CI->getArgOperand(0);
-  Value *Source2 = CI->getArgOperand(1);
-
-  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
-  // Cast source to LoadSizeType*.
-  if (Source1->getType() != LoadSizeType)
-    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
-  if (Source2->getType() != LoadSizeType)
-    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
-  // Get the base address using a GEP.
-  if (CurLoadEntry.Offset != 0) {
-    Source1 = Builder.CreateGEP(
-        LoadSizeType, Source1,
-        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
-    Source2 = Builder.CreateGEP(
-        LoadSizeType, Source2,
-        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
-  }
-
-  // Load LoadSizeType from the base address.
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  if (DL.isLittleEndian()) {
-    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
-    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
-    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
-  }
-
-  if (LoadSizeType != MaxLoadType) {
-    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
-    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
-  }
-
-  // Add the loaded values to the phi nodes for calculating memcmp result only
-  // if result is not used in a zero equality.
-  if (!IsUsedForZeroCmp) {
-    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
-    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
-  }
-
-  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
-  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
-                           ? EndBlock
-                           : LoadCmpBlocks[BlockIndex + 1];
-  // Early exit branch if difference found to ResultBlock. Otherwise, continue
-  // to next LoadCmpBlock or EndBlock.
-  BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
-  Builder.Insert(CmpBr);
-
-  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
-  // since early exit to ResultBlock was not taken (no difference was found in
-  // any of the bytes).
-  if (BlockIndex == LoadCmpBlocks.size() - 1) {
-    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
-    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
-  }
-}
-
-// This function populates the ResultBlock with a sequence to calculate the
-// memcmp result. It compares the two loaded source values and returns -1 if
-// src1 < src2 and 1 if src1 > src2.
-void MemCmpExpansion::emitMemCmpResultBlock() {
-  // Special case: if memcmp result is used in a zero equality, result does not
-  // need to be calculated and can simply return 1.
-  if (IsUsedForZeroCmp) {
-    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
-    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
-    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
-    PhiRes->addIncoming(Res, ResBlock.BB);
-    BranchInst *NewBr = BranchInst::Create(EndBlock);
-    Builder.Insert(NewBr);
-    return;
-  }
-  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
-  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
-
-  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
-                                  ResBlock.PhiSrc2);
-
-  Value *Res =
-      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
-                           ConstantInt::get(Builder.getInt32Ty(), 1));
-
-  BranchInst *NewBr = BranchInst::Create(EndBlock);
-  Builder.Insert(NewBr);
-  PhiRes->addIncoming(Res, ResBlock.BB);
-}
-
-void MemCmpExpansion::setupResultBlockPHINodes() {
-  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
-  Builder.SetInsertPoint(ResBlock.BB);
-  // Note: this assumes one load per block.
-  ResBlock.PhiSrc1 =
-      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
-  ResBlock.PhiSrc2 =
-      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
-}
-
-void MemCmpExpansion::setupEndBlockPHINodes() {
-  Builder.SetInsertPoint(&EndBlock->front());
-  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
-}
-
-Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
-  unsigned LoadIndex = 0;
-  // This loop populates each of the LoadCmpBlocks with the IR sequence to
-  // handle multiple loads per block.
-  for (unsigned I = 0; I < getNumBlocks(); ++I) {
-    emitLoadCompareBlockMultipleLoads(I, LoadIndex);
-  }
-
-  emitMemCmpResultBlock();
-  return PhiRes;
-}
-
-/// A memcmp expansion that compares equality with 0 and only has one block of
-/// load and compare can bypass the compare, branch, and phi IR that is required
-/// in the general case.
-Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
-  unsigned LoadIndex = 0;
-  Value *Cmp = getCompareLoadPairs(0, LoadIndex);
-  assert(LoadIndex == getNumLoads() && "some entries were not consumed");
-  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
-}
-
-/// A memcmp expansion that only has one block of load and compare can bypass
-/// the compare, branch, and phi IR that is required in the general case.
-Value *MemCmpExpansion::getMemCmpOneBlock() {
-  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
-
-  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
-  Value *Source1 = CI->getArgOperand(0);
-  Value *Source2 = CI->getArgOperand(1);
-
-  // Cast source to LoadSizeType*.
-  if (Source1->getType() != LoadSizeType)
-    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
-  if (Source2->getType() != LoadSizeType)
-    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
-  // Load LoadSizeType from the base address.
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  if (DL.isLittleEndian() && Size != 1) {
-    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
-    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
-    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
-  }
-
-  if (Size < 4) {
-    // The i8 and i16 cases don't need compares. We zext the loaded values and
-    // subtract them to get the suitable negative, zero, or positive i32 result.
-    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
-    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
-    return Builder.CreateSub(LoadSrc1, LoadSrc2);
-  }
-
-  // The result of memcmp is negative, zero, or positive, so produce that by
-  // subtracting 2 extended compare bits: sub (ugt, ult).
-  // If a target prefers to use selects to get -1/0/1, they should be able
-  // to transform this later. The inverse transform (going from selects to math)
-  // may not be possible in the DAG because the selects got converted into
-  // branches before we got there.
-  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
-  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
-  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
-  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
-  return Builder.CreateSub(ZextUGT, ZextULT);
-}
-
-// This function expands the memcmp call into an inline expansion and returns
-// the memcmp result.
-Value *MemCmpExpansion::getMemCmpExpansion() {
-  // A memcmp with zero-comparison with only one block of load and compare does
-  // not need to set up any extra blocks. This case could be handled in the DAG,
-  // but since we have all of the machinery to flexibly expand any memcpy here,
-  // we choose to handle this case too to avoid fragmented lowering.
-  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
-    BasicBlock *StartBlock = CI->getParent();
-    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
-    setupEndBlockPHINodes();
-    createResultBlock();
-
-    // If return value of memcmp is not used in a zero equality, we need to
-    // calculate which source was larger. The calculation requires the
-    // two loaded source values of each load compare block.
-    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
-
-    // Create the number of required load compare basic blocks.
-    createLoadCmpBlocks();
-
-    // Update the terminator added by splitBasicBlock to branch to the first
-    // LoadCmpBlock.
-    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
-  }
-
-  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
-  if (IsUsedForZeroCmp)
-    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
-                               : getMemCmpExpansionZeroCase();
-
-  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
-  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
-
-  for (unsigned I = 0; I < getNumBlocks(); ++I) {
-    emitLoadCompareBlock(I);
-  }
-
-  emitMemCmpResultBlock();
-  return PhiRes;
-}
-
-// This function checks to see if an expansion of memcmp can be generated.
-// It checks for constant compare size that is less than the max inline size.
-// If an expansion cannot occur, returns false to leave as a library call.
-// Otherwise, the library call is replaced with a new IR instruction sequence.
-/// We want to transform:
-/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
-/// To:
-/// loadbb:
-///  %0 = bitcast i32* %buffer2 to i8*
-///  %1 = bitcast i32* %buffer1 to i8*
-///  %2 = bitcast i8* %1 to i64*
-///  %3 = bitcast i8* %0 to i64*
-///  %4 = load i64, i64* %2
-///  %5 = load i64, i64* %3
-///  %6 = call i64 @llvm.bswap.i64(i64 %4)
-///  %7 = call i64 @llvm.bswap.i64(i64 %5)
-///  %8 = sub i64 %6, %7
-///  %9 = icmp ne i64 %8, 0
-///  br i1 %9, label %res_block, label %loadbb1
-/// res_block:                                        ; preds = %loadbb2,
-/// %loadbb1, %loadbb
-///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
-///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
-///  %10 = icmp ult i64 %phi.src1, %phi.src2
-///  %11 = select i1 %10, i32 -1, i32 1
-///  br label %endblock
-/// loadbb1:                                          ; preds = %loadbb
-///  %12 = bitcast i32* %buffer2 to i8*
-///  %13 = bitcast i32* %buffer1 to i8*
-///  %14 = bitcast i8* %13 to i32*
-///  %15 = bitcast i8* %12 to i32*
-///  %16 = getelementptr i32, i32* %14, i32 2
-///  %17 = getelementptr i32, i32* %15, i32 2
-///  %18 = load i32, i32* %16
-///  %19 = load i32, i32* %17
-///  %20 = call i32 @llvm.bswap.i32(i32 %18)
-///  %21 = call i32 @llvm.bswap.i32(i32 %19)
-///  %22 = zext i32 %20 to i64
-///  %23 = zext i32 %21 to i64
-///  %24 = sub i64 %22, %23
-///  %25 = icmp ne i64 %24, 0
-///  br i1 %25, label %res_block, label %loadbb2
-/// loadbb2:                                          ; preds = %loadbb1
-///  %26 = bitcast i32* %buffer2 to i8*
-///  %27 = bitcast i32* %buffer1 to i8*
-///  %28 = bitcast i8* %27 to i16*
-///  %29 = bitcast i8* %26 to i16*
-///  %30 = getelementptr i16, i16* %28, i16 6
-///  %31 = getelementptr i16, i16* %29, i16 6
-///  %32 = load i16, i16* %30
-///  %33 = load i16, i16* %31
-///  %34 = call i16 @llvm.bswap.i16(i16 %32)
-///  %35 = call i16 @llvm.bswap.i16(i16 %33)
-///  %36 = zext i16 %34 to i64
-///  %37 = zext i16 %35 to i64
-///  %38 = sub i64 %36, %37
-///  %39 = icmp ne i64 %38, 0
-///  br i1 %39, label %res_block, label %loadbb3
-/// loadbb3:                                          ; preds = %loadbb2
-///  %40 = bitcast i32* %buffer2 to i8*
-///  %41 = bitcast i32* %buffer1 to i8*
-///  %42 = getelementptr i8, i8* %41, i8 14
-///  %43 = getelementptr i8, i8* %40, i8 14
-///  %44 = load i8, i8* %42
-///  %45 = load i8, i8* %43
-///  %46 = zext i8 %44 to i32
-///  %47 = zext i8 %45 to i32
-///  %48 = sub i32 %46, %47
-///  br label %endblock
-/// endblock:                                         ; preds = %res_block,
-/// %loadbb3
-///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
-///  ret i32 %phi.res
-static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
-  NumMemCmpCalls++;
-
-  // Early exit from expansion if -Oz.
-  if (CI->getFunction()->optForMinSize())
-    return false;
-
-  // Early exit from expansion if size is not a constant.
-  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-  if (!SizeCast) {
-    NumMemCmpNotConstant++;
-    return false;
-  }
-  const uint64_t SizeVal = SizeCast->getZExtValue();
-
-  if (SizeVal == 0) {
-    return false;
-  }
-
-  // TTI call to check if target would like to expand memcmp. Also, get the
-  // available load sizes.
-  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
-  if (!Options) return false;
-
-  const unsigned MaxNumLoads =
-      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
-
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
-
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
-
-  NumMemCmpInlined++;
-
-  Value *Res = Expansion.getMemCmpExpansion();
-
-  // Replace call with result of expansion and erase call.
-  CI->replaceAllUsesWith(Res);
-  CI->eraseFromParent();
-
-  return true;
-}
-
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -2542,12 +1858,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
     return true;
   }
 
-  LibFunc Func;
-  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
-      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
-    ModifiedDT = true;
-    return true;
-  }
   return false;
 }
 
@@ -3377,8 +2687,65 @@ private:
                              Value *PromotedOperand) const;
 };
 
+/// \brief Keep track of simplification of Phi nodes.
+/// Accept the set of all phi nodes and erase phi node from this set
+/// if it is simplified.
+class SimplificationTracker {
+  DenseMap<Value *, Value *> Storage;
+  const SimplifyQuery &SQ;
+  SmallPtrSetImpl<PHINode *> &AllPhiNodes;
+  SmallPtrSetImpl<SelectInst *> &AllSelectNodes;
+
+public:
+  SimplificationTracker(const SimplifyQuery &sq,
+                        SmallPtrSetImpl<PHINode *> &APN,
+                        SmallPtrSetImpl<SelectInst *> &ASN)
+      : SQ(sq), AllPhiNodes(APN), AllSelectNodes(ASN) {}
+
+  Value *Get(Value *V) {
+    do {
+      auto SV = Storage.find(V);
+      if (SV == Storage.end())
+        return V;
+      V = SV->second;
+    } while (true);
+  }
+
+  Value *Simplify(Value *Val) {
+    SmallVector<Value *, 32> WorkList;
+    SmallPtrSet<Value *, 32> Visited;
+    WorkList.push_back(Val);
+    while (!WorkList.empty()) {
+      auto P = WorkList.pop_back_val();
+      if (!Visited.insert(P).second)
+        continue;
+      if (auto *PI = dyn_cast<Instruction>(P))
+        if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) {
+          for (auto *U : PI->users())
+            WorkList.push_back(cast<Value>(U));
+          Put(PI, V);
+          PI->replaceAllUsesWith(V);
+          if (auto *PHI = dyn_cast<PHINode>(PI))
+            AllPhiNodes.erase(PHI);
+          if (auto *Select = dyn_cast<SelectInst>(PI))
+            AllSelectNodes.erase(Select);
+          PI->eraseFromParent();
+        }
+    }
+    return Get(Val);
+  }
+
+  void Put(Value *From, Value *To) {
+    Storage.insert({ From, To });
+  }
+};
+
 /// \brief A helper class for combining addressing modes.
 class AddressingModeCombiner {
+  typedef std::pair<Value *, BasicBlock *> ValueInBB;
+  typedef DenseMap<ValueInBB, Value *> FoldAddrToValueMapping;
+  typedef std::pair<PHINode *, PHINode *> PHIPair;
+
 private:
   /// The addressing modes we've collected.
   SmallVector<ExtAddrMode, 16> AddrModes;
@@ -3389,7 +2756,19 @@ private:
   /// Are the AddrModes that we have all just equal to their original values?
   bool AllAddrModesTrivial = true;
 
+  /// Common Type for all different fields in addressing modes.
+  Type *CommonType;
+
+  /// SimplifyQuery for simplifyInstruction utility.
+  const SimplifyQuery &SQ;
+
+  /// Original Address.
+  ValueInBB Original;
+
 public:
+  AddressingModeCombiner(const SimplifyQuery &_SQ, ValueInBB OriginalValue)
+      : CommonType(nullptr), SQ(_SQ), Original(OriginalValue) {}
+
   /// \brief Get the combined AddrMode
   const ExtAddrMode &getAddrMode() const {
     return AddrModes[0];
@@ -3457,12 +2836,356 @@ public:
     if (AllAddrModesTrivial)
       return false;
 
-    // TODO: Combine multiple AddrModes by inserting a select or phi for the
-    // field in which the AddrModes differ.
-    return false;
+    if (DisableComplexAddrModes)
+      return false;
+
+    // For now we support only different base registers.
+    // TODO: enable others.
+    if (DifferentField != ExtAddrMode::BaseRegField)
+      return false;
+
+    // Build a map between <original value, basic block where we saw it> to
+    // value of base register.
+    FoldAddrToValueMapping Map;
+    initializeMap(Map);
+
+    Value *CommonValue = findCommon(Map);
+    if (CommonValue)
+      AddrModes[0].BaseReg = CommonValue;
+    return CommonValue != nullptr;
+  }
+
+private:
+  /// \brief Initialize Map with anchor values. For address seen in some BB
+  /// we set the value of different field saw in this address.
+  /// If address is not an instruction than basic block is set to null.
+  /// At the same time we find a common type for different field we will
+  /// use to create new Phi/Select nodes. Keep it in CommonType field.
+  void initializeMap(FoldAddrToValueMapping &Map) {
+    // Keep track of keys where the value is null. We will need to replace it
+    // with constant null when we know the common type.
+    SmallVector<ValueInBB, 2> NullValue;
+    for (auto &AM : AddrModes) {
+      BasicBlock *BB = nullptr;
+      if (Instruction *I = dyn_cast<Instruction>(AM.OriginalValue))
+        BB = I->getParent();
+
+      // For now we support only base register as different field.
+      // TODO: Enable others.
+      Value *DV = AM.BaseReg;
+      if (DV) {
+        if (CommonType)
+          assert(CommonType == DV->getType() && "Different types detected!");
+        else
+          CommonType = DV->getType();
+        Map[{ AM.OriginalValue, BB }] = DV;
+      } else {
+        NullValue.push_back({ AM.OriginalValue, BB });
+      }
+    }
+    assert(CommonType && "At least one non-null value must be!");
+    for (auto VIBB : NullValue)
+      Map[VIBB] = Constant::getNullValue(CommonType);
+  }
+
+  /// \brief We have mapping between value A and basic block where value A
+  /// seen to other value B where B was a field in addressing mode represented
+  /// by A. Also we have an original value C representin an address in some
+  /// basic block. Traversing from C through phi and selects we ended up with
+  /// A's in a map. This utility function tries to find a value V which is a
+  /// field in addressing mode C and traversing through phi nodes and selects
+  /// we will end up in corresponded values B in a map.
+  /// The utility will create a new Phi/Selects if needed.
+  // The simple example looks as follows:
+  // BB1:
+  //   p1 = b1 + 40
+  //   br cond BB2, BB3
+  // BB2:
+  //   p2 = b2 + 40
+  //   br BB3
+  // BB3:
+  //   p = phi [p1, BB1], [p2, BB2]
+  //   v = load p
+  // Map is
+  //   <p1, BB1> -> b1
+  //   <p2, BB2> -> b2
+  // Request is
+  //   <p, BB3> -> ?
+  // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3
+  Value *findCommon(FoldAddrToValueMapping &Map) {
+    // Tracks of new created Phi nodes.
+    SmallPtrSet<PHINode *, 32> NewPhiNodes;
+    // Tracks of new created Select nodes.
+    SmallPtrSet<SelectInst *, 32> NewSelectNodes;
+    // Tracks the simplification of new created phi nodes. The reason we use
+    // this mapping is because we will add new created Phi nodes in AddrToBase.
+    // Simplification of Phi nodes is recursive, so some Phi node may
+    // be simplified after we added it to AddrToBase.
+    // Using this mapping we can find the current value in AddrToBase.
+    SimplificationTracker ST(SQ, NewPhiNodes, NewSelectNodes);
+
+    // First step, DFS to create PHI nodes for all intermediate blocks.
+    // Also fill traverse order for the second step.
+    SmallVector<ValueInBB, 32> TraverseOrder;
+    InsertPlaceholders(Map, TraverseOrder, NewPhiNodes, NewSelectNodes);
+
+    // Second Step, fill new nodes by merged values and simplify if possible.
+    FillPlaceholders(Map, TraverseOrder, ST);
+
+    if (!AddrSinkNewSelects && NewSelectNodes.size() > 0) {
+      DestroyNodes(NewPhiNodes);
+      DestroyNodes(NewSelectNodes);
+      return nullptr;
+    }
+
+    // Now we'd like to match New Phi nodes to existed ones.
+    unsigned PhiNotMatchedCount = 0;
+    if (!MatchPhiSet(NewPhiNodes, ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
+      DestroyNodes(NewPhiNodes);
+      DestroyNodes(NewSelectNodes);
+      return nullptr;
+    }
+
+    auto *Result = ST.Get(Map.find(Original)->second);
+    if (Result) {
+      NumMemoryInstsPhiCreated += NewPhiNodes.size() + PhiNotMatchedCount;
+      NumMemoryInstsSelectCreated += NewSelectNodes.size();
+    }
+    return Result;
+  }
+
+  /// \brief Destroy nodes from a set.
+  template <typename T> void DestroyNodes(SmallPtrSetImpl<T *> &Instructions) {
+    // For safe erasing, replace the Phi with dummy value first.
+    auto Dummy = UndefValue::get(CommonType);
+    for (auto I : Instructions) {
+      I->replaceAllUsesWith(Dummy);
+      I->eraseFromParent();
+    }
+  }
+
+  /// \brief Try to match PHI node to Candidate.
+  /// Matcher tracks the matched Phi nodes.
+  bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
+                    DenseSet<PHIPair> &Matcher,
+                    SmallPtrSetImpl<PHINode *> &PhiNodesToMatch) {
+    SmallVector<PHIPair, 8> WorkList;
+    Matcher.insert({ PHI, Candidate });
+    WorkList.push_back({ PHI, Candidate });
+    SmallSet<PHIPair, 8> Visited;
+    while (!WorkList.empty()) {
+      auto Item = WorkList.pop_back_val();
+      if (!Visited.insert(Item).second)
+        continue;
+      // We iterate over all incoming values to Phi to compare them.
+      // If values are different and both of them Phi and the first one is a
+      // Phi we added (subject to match) and both of them is in the same basic
+      // block then we can match our pair if values match. So we state that
+      // these values match and add it to work list to verify that.
+      for (auto B : Item.first->blocks()) {
+        Value *FirstValue = Item.first->getIncomingValueForBlock(B);
+        Value *SecondValue = Item.second->getIncomingValueForBlock(B);
+        if (FirstValue == SecondValue)
+          continue;
+
+        PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
+        PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
+
+        // One of them is not Phi or
+        // The first one is not Phi node from the set we'd like to match or
+        // Phi nodes from different basic blocks then
+        // we will not be able to match.
+        if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
+            FirstPhi->getParent() != SecondPhi->getParent())
+          return false;
+
+        // If we already matched them then continue.
+        if (Matcher.count({ FirstPhi, SecondPhi }))
+          continue;
+        // So the values are different and does not match. So we need them to
+        // match.
+        Matcher.insert({ FirstPhi, SecondPhi });
+        // But me must check it.
+        WorkList.push_back({ FirstPhi, SecondPhi });
+      }
+    }
+    return true;
   }
-};
 
+  /// \brief For the given set of PHI nodes try to find their equivalents.
+  /// Returns false if this matching fails and creation of new Phi is disabled.
+  bool MatchPhiSet(SmallPtrSetImpl<PHINode *> &PhiNodesToMatch,
+                   SimplificationTracker &ST, bool AllowNewPhiNodes,
+                   unsigned &PhiNotMatchedCount) {
+    DenseSet<PHIPair> Matched;
+    SmallPtrSet<PHINode *, 8> WillNotMatch;
+    while (PhiNodesToMatch.size()) {
+      PHINode *PHI = *PhiNodesToMatch.begin();
+
+      // Add us, if no Phi nodes in the basic block we do not match.
+      WillNotMatch.clear();
+      WillNotMatch.insert(PHI);
+
+      // Traverse all Phis until we found equivalent or fail to do that.
+      bool IsMatched = false;
+      for (auto &P : PHI->getParent()->phis()) {
+        if (&P == PHI)
+          continue;
+        if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
+          break;
+        // If it does not match, collect all Phi nodes from matcher.
+        // if we end up with no match, them all these Phi nodes will not match
+        // later.
+        for (auto M : Matched)
+          WillNotMatch.insert(M.first);
+        Matched.clear();
+      }
+      if (IsMatched) {
+        // Replace all matched values and erase them.
+        for (auto MV : Matched) {
+          MV.first->replaceAllUsesWith(MV.second);
+          PhiNodesToMatch.erase(MV.first);
+          ST.Put(MV.first, MV.second);
+          MV.first->eraseFromParent();
+        }
+        Matched.clear();
+        continue;
+      }
+      // If we are not allowed to create new nodes then bail out.
+      if (!AllowNewPhiNodes)
+        return false;
+      // Just remove all seen values in matcher. They will not match anything.
+      PhiNotMatchedCount += WillNotMatch.size();
+      for (auto *P : WillNotMatch)
+        PhiNodesToMatch.erase(P);
+    }
+    return true;
+  }
+  /// \brief Fill the placeholder with values from predecessors and simplify it.
+  void FillPlaceholders(FoldAddrToValueMapping &Map,
+                        SmallVectorImpl<ValueInBB> &TraverseOrder,
+                        SimplificationTracker &ST) {
+    while (!TraverseOrder.empty()) {
+      auto Current = TraverseOrder.pop_back_val();
+      assert(Map.find(Current) != Map.end() && "No node to fill!!!");
+      Value *CurrentValue = Current.first;
+      BasicBlock *CurrentBlock = Current.second;
+      Value *V = Map[Current];
+
+      if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
+        // CurrentValue also must be Select.
+        auto *CurrentSelect = cast<SelectInst>(CurrentValue);
+        auto *TrueValue = CurrentSelect->getTrueValue();
+        ValueInBB TrueItem = { TrueValue, isa<Instruction>(TrueValue)
+                                              ? CurrentBlock
+                                              : nullptr };
+        assert(Map.find(TrueItem) != Map.end() && "No True Value!");
+        Select->setTrueValue(Map[TrueItem]);
+        auto *FalseValue = CurrentSelect->getFalseValue();
+        ValueInBB FalseItem = { FalseValue, isa<Instruction>(FalseValue)
+                                                ? CurrentBlock
+                                                : nullptr };
+        assert(Map.find(FalseItem) != Map.end() && "No False Value!");
+        Select->setFalseValue(Map[FalseItem]);
+      } else {
+        // Must be a Phi node then.
+        PHINode *PHI = cast<PHINode>(V);
+        // Fill the Phi node with values from predecessors.
+        bool IsDefinedInThisBB =
+            cast<Instruction>(CurrentValue)->getParent() == CurrentBlock;
+        auto *CurrentPhi = dyn_cast<PHINode>(CurrentValue);
+        for (auto B : predecessors(CurrentBlock)) {
+          Value *PV = IsDefinedInThisBB
+                          ? CurrentPhi->getIncomingValueForBlock(B)
+                          : CurrentValue;
+          ValueInBB item = { PV, isa<Instruction>(PV) ? B : nullptr };
+          assert(Map.find(item) != Map.end() && "No predecessor Value!");
+          PHI->addIncoming(ST.Get(Map[item]), B);
+        }
+      }
+      // Simplify if possible.
+      Map[Current] = ST.Simplify(V);
+    }
+  }
+
+  /// Starting from value recursively iterates over predecessors up to known
+  /// ending values represented in a map. For each traversed block inserts
+  /// a placeholder Phi or Select.
+  /// Reports all new created Phi/Select nodes by adding them to set.
+  /// Also reports and order in what basic blocks have been traversed.
+  void InsertPlaceholders(FoldAddrToValueMapping &Map,
+                          SmallVectorImpl<ValueInBB> &TraverseOrder,
+                          SmallPtrSetImpl<PHINode *> &NewPhiNodes,
+                          SmallPtrSetImpl<SelectInst *> &NewSelectNodes) {
+    SmallVector<ValueInBB, 32> Worklist;
+    assert((isa<PHINode>(Original.first) || isa<SelectInst>(Original.first)) &&
+           "Address must be a Phi or Select node");
+    auto *Dummy = UndefValue::get(CommonType);
+    Worklist.push_back(Original);
+    while (!Worklist.empty()) {
+      auto Current = Worklist.pop_back_val();
+      // If value is not an instruction it is something global, constant,
+      // parameter and we can say that this value is observable in any block.
+      // Set block to null to denote it.
+      // Also please take into account that it is how we build anchors.
+      if (!isa<Instruction>(Current.first))
+        Current.second = nullptr;
+      // if it is already visited or it is an ending value then skip it.
+      if (Map.find(Current) != Map.end())
+        continue;
+      TraverseOrder.push_back(Current);
+
+      Value *CurrentValue = Current.first;
+      BasicBlock *CurrentBlock = Current.second;
+      // CurrentValue must be a Phi node or select. All others must be covered
+      // by anchors.
+      Instruction *CurrentI = cast<Instruction>(CurrentValue);
+      bool IsDefinedInThisBB = CurrentI->getParent() == CurrentBlock;
+
+      unsigned PredCount =
+          std::distance(pred_begin(CurrentBlock), pred_end(CurrentBlock));
+      // if Current Value is not defined in this basic block we are interested
+      // in values in predecessors.
+      if (!IsDefinedInThisBB) {
+        assert(PredCount && "Unreachable block?!");
+        PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
+                                       &CurrentBlock->front());
+        Map[Current] = PHI;
+        NewPhiNodes.insert(PHI);
+        // Add all predecessors in work list.
+        for (auto B : predecessors(CurrentBlock))
+          Worklist.push_back({ CurrentValue, B });
+        continue;
+      }
+      // Value is defined in this basic block.
+      if (SelectInst *OrigSelect = dyn_cast<SelectInst>(CurrentI)) {
+        // Is it OK to get metadata from OrigSelect?!
+        // Create a Select placeholder with dummy value.
+        SelectInst *Select =
+            SelectInst::Create(OrigSelect->getCondition(), Dummy, Dummy,
+                               OrigSelect->getName(), OrigSelect, OrigSelect);
+        Map[Current] = Select;
+        NewSelectNodes.insert(Select);
+        // We are interested in True and False value in this basic block.
+        Worklist.push_back({ OrigSelect->getTrueValue(), CurrentBlock });
+        Worklist.push_back({ OrigSelect->getFalseValue(), CurrentBlock });
+      } else {
+        // It must be a Phi node then.
+        auto *CurrentPhi = cast<PHINode>(CurrentI);
+        // Create new Phi node for merge of bases.
+        assert(PredCount && "Unreachable block?!");
+        PHINode *PHI = PHINode::Create(CommonType, PredCount, "sunk_phi",
+                                       &CurrentBlock->front());
+        Map[Current] = PHI;
+        NewPhiNodes.insert(PHI);
+
+        // Add all predecessors in work list.
+        for (auto B : predecessors(CurrentBlock))
+          Worklist.push_back({ CurrentPhi->getIncomingValueForBlock(B), B });
+      }
+    }
+  }
+};
 } // end anonymous namespace
 
 /// Try adding ScaleReg*Scale to the current addressing mode.
@@ -4555,7 +4278,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   // the graph are compatible.
   bool PhiOrSelectSeen = false;
   SmallVector<Instruction*, 16> AddrModeInsts;
-  AddressingModeCombiner AddrModes;
+  const SimplifyQuery SQ(*DL, TLInfo);
+  AddressingModeCombiner AddrModes(SQ, { Addr, MemoryInst->getParent() });
   TypePromotionTransaction TPT(RemovedInsts);
   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
       TPT.getRestorationPoint();
@@ -6715,7 +6439,7 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
       Instruction *Insn = &*BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone. These
-      // instrinsics describe the address of a variable (= the alloca)
+      // intrinsics describe the address of a variable (= the alloca)
       // being taken.  They should not be moved next to the alloca
       // (and to the beginning of the scope), but rather stay close to
       // where said address is used.
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index a791c01c48b..9ef172274c1 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -26,11 +26,11 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index cf21316ec22..0123afb4cd8 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -29,12 +29,12 @@
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cassert>
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 91d18e2bcaa..ef1452087f2 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -15,10 +15,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
index ab9a0592e01..2613471714e 100644
--- a/lib/CodeGen/DetectDeadLanes.cpp
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -34,12 +34,12 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index 402afe75b14..cb6c3ae04c7 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -30,10 +30,10 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index e272d25047e..28f716ee6c2 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -15,10 +15,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp
new file mode 100644
index 00000000000..c5910c18d89
--- /dev/null
+++ b/lib/CodeGen/ExpandMemCmp.cpp
@@ -0,0 +1,828 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
+namespace {
+
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB = nullptr;
+    PHINode *PhiSrc1 = nullptr;
+    PHINode *PhiSrc2 = nullptr;
+
+    ResultBlock() = default;
+  };
+
+  CallInst *const CI;
+  ResultBlock ResBlock;
+  const uint64_t Size;
+  unsigned MaxLoadSize;
+  uint64_t NumLoadsNonOneByte;
+  const uint64_t NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  const bool IsUsedForZeroCmp;
+  const DataLayout &DL;
+  IRBuilder<> Builder;
+  // Represents the decomposition in blocks of the expansion. For example,
+  // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}.
+  // TODO(courbet): Involve the target more in this computation. On X86, 7
+  // bytes can be done more efficiently with two overlaping 4-byte loads than
+  // covering the interval with [{4, 0},{2, 4},{1, 6}}.
+  struct LoadEntry {
+    LoadEntry(unsigned LoadSize, uint64_t Offset)
+        : LoadSize(LoadSize), Offset(Offset) {
+      assert(Offset % LoadSize == 0 && "invalid load entry");
+    }
+
+    uint64_t getGEPIndex() const { return Offset / LoadSize; }
+
+    // The size of the load for this block, in bytes.
+    const unsigned LoadSize;
+    // The offset of this load WRT the base pointer, in bytes.
+    const uint64_t Offset;
+  };
+  SmallVector<LoadEntry, 8> LoadSequence;
+
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+  void emitLoadCompareBlock(unsigned BlockIndex);
+  void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                         unsigned &LoadIndex);
+  void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned GEPIndex);
+  void emitMemCmpResultBlock();
+  Value *getMemCmpExpansionZeroCase();
+  Value *getMemCmpEqZeroOneBlock();
+  Value *getMemCmpOneBlock();
+
+ public:
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+                  unsigned NumLoadsPerBlock, const DataLayout &DL);
+
+  unsigned getNumBlocks();
+  uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+  Value *getMemCmpExpansion();
+};
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+    CallInst *const CI, uint64_t Size,
+    const TargetTransformInfo::MemCmpExpansionOptions &Options,
+    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
+    const unsigned NumLoadsPerBlock, const DataLayout &TheDataLayout)
+    : CI(CI),
+      Size(Size),
+      MaxLoadSize(0),
+      NumLoadsNonOneByte(0),
+      NumLoadsPerBlock(NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
+      DL(TheDataLayout),
+      Builder(CI) {
+  assert(Size > 0 && "zero blocks");
+  // Scale the max size down if the target can load more bytes than we need.
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
+  }
+  this->MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
+  // Compute the decomposition.
+  uint64_t CurSize = Size;
+  uint64_t Offset = 0;
+  while (CurSize && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
+    assert(LoadSize > 0 && "zero load size");
+    const uint64_t NumLoadsForThisSize = CurSize / LoadSize;
+    if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+      // Do not expand if the total number of loads is larger than what the
+      // target allows. Note that it's important that we exit before completing
+      // the expansion to avoid using a ton of memory to store the expansion for
+      // large sizes.
+      LoadSequence.clear();
+      return;
+    }
+    if (NumLoadsForThisSize > 0) {
+      for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+        LoadSequence.push_back({LoadSize, Offset});
+        Offset += LoadSize;
+      }
+      if (LoadSize > 1) {
+        ++NumLoadsNonOneByte;
+      }
+      CurSize = CurSize % LoadSize;
+    }
+    ++LoadSizeIndex;
+  }
+  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+  if (IsUsedForZeroCmp)
+    return getNumLoads() / NumLoadsPerBlock +
+           (getNumLoads() % NumLoadsPerBlock != 0 ? 1 : 0);
+  return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < getNumBlocks(); i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+                                               unsigned GEPIndex) {
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex.
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
+
+  if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock. Otherwise, continue to
+    // next LoadCmpBlock,
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock.
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+                                            unsigned &LoadIndex) {
+  assert(LoadIndex < getNumLoads() &&
+         "getCompareLoadPairs() called with no remaining loads");
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  const unsigned NumLoads =
+      std::min(getNumLoads() - LoadIndex, NumLoadsPerBlock);
+
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+  Value *Cmp = nullptr;
+  // If we have multiple loads per block, we need to generate a composite
+  // comparison using xor+or. The type for the combinations is the largest load
+  // type.
+  IntegerType *const MaxLoadType =
+      NumLoads == 1 ? nullptr
+                    : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+    const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+
+    IntegerType *LoadSizeType =
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*.
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using a GEP.
+    if (CurLoadEntry.Offset != 0) {
+      Source1 = Builder.CreateGEP(
+          LoadSizeType, Source1,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+      Source2 = Builder.CreateGEP(
+          LoadSizeType, Source2,
+          ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    }
+
+    // Get a constant or load a value for each source address.
+    Value *LoadSrc1 = nullptr;
+    if (auto *Source1C = dyn_cast<Constant>(Source1))
+      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+    if (!LoadSrc1)
+      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+    Value *LoadSrc2 = nullptr;
+    if (auto *Source2C = dyn_cast<Constant>(Source2))
+      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+    if (!LoadSrc2)
+      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+    if (NumLoads != 1) {
+      if (LoadSizeType != MaxLoadType) {
+        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+      }
+      // If we have multiple loads per block, we need to generate a composite
+      // comparison using xor+or.
+      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateZExt(Diff, MaxLoadType);
+      XorList.push_back(Diff);
+    } else {
+      // If there's only one load per block, we just compare the loaded values.
+      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+    }
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  if (!Cmp) {
+    // Pairwise OR the XOR results.
+    OrList = pairWiseOr(XorList);
+
+    // Pairwise OR the OR results until one result left.
+    while (OrList.size() != 1) {
+      OrList = pairWiseOr(OrList);
+    }
+    Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+  }
+
+  return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+                                                        unsigned &LoadIndex) {
+  Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise,
+  // continue to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function creates the IR intructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock. The special handling can
+// simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+  // There is one load per block in this case, BlockIndex == LoadIndex.
+  const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+  if (CurLoadEntry.LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex,
+                                              CurLoadEntry.getGEPIndex());
+    return;
+  }
+
+  Type *LoadSizeType =
+      IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using a GEP.
+  if (CurLoadEntry.Offset != 0) {
+    Source1 = Builder.CreateGEP(
+        LoadSizeType, Source1,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+    Source2 = Builder.CreateGEP(
+        LoadSizeType, Source2,
+        ConstantInt::get(LoadSizeType, CurLoadEntry.getGEPIndex()));
+  }
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian()) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[BlockIndex + 1];
+  // Early exit branch if difference found to ResultBlock. Otherwise, continue
+  // to next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes).
+  if (BlockIndex == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+  }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+  // Special case: if memcmp result is used in a zero equality, result does not
+  // need to be calculated and can simply return 1.
+  if (IsUsedForZeroCmp) {
+    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+    PhiRes->addIncoming(Res, ResBlock.BB);
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+    return;
+  }
+  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+                                  ResBlock.PhiSrc2);
+
+  Value *Res =
+      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+                           ConstantInt::get(Builder.getInt32Ty(), 1));
+
+  BranchInst *NewBr = BranchInst::Create(EndBlock);
+  Builder.Insert(NewBr);
+  PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  Builder.SetInsertPoint(ResBlock.BB);
+  // Note: this assumes one load per block.
+  ResBlock.PhiSrc1 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
+  ResBlock.PhiSrc2 =
+      Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+  Builder.SetInsertPoint(&EndBlock->front());
+  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
+  unsigned LoadIndex = 0;
+  // This loop populates each of the LoadCmpBlocks with the IR sequence to
+  // handle multiple loads per block.
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+  unsigned LoadIndex = 0;
+  Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+  assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+  assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  // Cast source to LoadSizeType*.
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Load LoadSizeType from the base address.
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (DL.isLittleEndian() && Size != 1) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (Size < 4) {
+    // The i8 and i16 cases don't need compares. We zext the loaded values and
+    // subtract them to get the suitable negative, zero, or positive i32 result.
+    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
+    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
+    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+  }
+
+  // The result of memcmp is negative, zero, or positive, so produce that by
+  // subtracting 2 extended compare bits: sub (ugt, ult).
+  // If a target prefers to use selects to get -1/0/1, they should be able
+  // to transform this later. The inverse transform (going from selects to math)
+  // may not be possible in the DAG because the selects got converted into
+  // branches before we got there.
+  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
+  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+  Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+  return Builder.CreateSub(ZextUGT, ZextULT);
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+  // A memcmp with zero-comparison with only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcpy here,
+  // we choose to handle this case too to avoid fragmented lowering.
+  if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || getNumBlocks() != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
+
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
+
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  if (IsUsedForZeroCmp)
+    return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+                               : getMemCmpExpansionZeroCase();
+
+  // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+  if (getNumBlocks() == 1 && NumLoadsPerBlock == 1) return getMemCmpOneBlock();
+
+  for (unsigned I = 0; I < getNumBlocks(); ++I) {
+    emitLoadCompareBlock(I);
+  }
+
+  emitMemCmpResultBlock();
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced with a new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+///  %0 = bitcast i32* %buffer2 to i8*
+///  %1 = bitcast i32* %buffer1 to i8*
+///  %2 = bitcast i8* %1 to i64*
+///  %3 = bitcast i8* %0 to i64*
+///  %4 = load i64, i64* %2
+///  %5 = load i64, i64* %3
+///  %6 = call i64 @llvm.bswap.i64(i64 %4)
+///  %7 = call i64 @llvm.bswap.i64(i64 %5)
+///  %8 = sub i64 %6, %7
+///  %9 = icmp ne i64 %8, 0
+///  br i1 %9, label %res_block, label %loadbb1
+/// res_block:                                        ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+///  %10 = icmp ult i64 %phi.src1, %phi.src2
+///  %11 = select i1 %10, i32 -1, i32 1
+///  br label %endblock
+/// loadbb1:                                          ; preds = %loadbb
+///  %12 = bitcast i32* %buffer2 to i8*
+///  %13 = bitcast i32* %buffer1 to i8*
+///  %14 = bitcast i8* %13 to i32*
+///  %15 = bitcast i8* %12 to i32*
+///  %16 = getelementptr i32, i32* %14, i32 2
+///  %17 = getelementptr i32, i32* %15, i32 2
+///  %18 = load i32, i32* %16
+///  %19 = load i32, i32* %17
+///  %20 = call i32 @llvm.bswap.i32(i32 %18)
+///  %21 = call i32 @llvm.bswap.i32(i32 %19)
+///  %22 = zext i32 %20 to i64
+///  %23 = zext i32 %21 to i64
+///  %24 = sub i64 %22, %23
+///  %25 = icmp ne i64 %24, 0
+///  br i1 %25, label %res_block, label %loadbb2
+/// loadbb2:                                          ; preds = %loadbb1
+///  %26 = bitcast i32* %buffer2 to i8*
+///  %27 = bitcast i32* %buffer1 to i8*
+///  %28 = bitcast i8* %27 to i16*
+///  %29 = bitcast i8* %26 to i16*
+///  %30 = getelementptr i16, i16* %28, i16 6
+///  %31 = getelementptr i16, i16* %29, i16 6
+///  %32 = load i16, i16* %30
+///  %33 = load i16, i16* %31
+///  %34 = call i16 @llvm.bswap.i16(i16 %32)
+///  %35 = call i16 @llvm.bswap.i16(i16 %33)
+///  %36 = zext i16 %34 to i64
+///  %37 = zext i16 %35 to i64
+///  %38 = sub i64 %36, %37
+///  %39 = icmp ne i64 %38, 0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                          ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                         ; preds = %res_block,
+/// %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getFunction()->optForMinSize())
+    return false;
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+  const uint64_t SizeVal = SizeCast->getZExtValue();
+
+  if (SizeVal == 0) {
+    return false;
+  }
+
+  // TTI call to check if target would like to expand memcmp. Also, get the
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  if (!Options) return false;
+
+  const unsigned MaxNumLoads =
+      TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize());
+
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
+                            IsUsedForZeroCmp, MemCmpNumLoadsPerBlock, *DL);
+
+  // Don't expand if this will require more loads than desired by the target.
+  if (Expansion.getNumLoads() == 0) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  Value *Res = Expansion.getMemCmpExpansion();
+
+  // Replace call with result of expansion and erase call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
+
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+  static char ID;
+
+  ExpandMemCmpPass() : FunctionPass(ID) {
+    initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F)) return false;
+
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC) {
+      return false;
+    }
+    const TargetLowering* TL =
+        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+    const TargetLibraryInfo *TLI =
+        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const TargetTransformInfo *TTI =
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto PA = runImpl(F, TLI, TTI, TL);
+    return !PA.areAllPreserved();
+  }
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+                            const TargetTransformInfo *TTI,
+                            const TargetLowering* TL);
+  // Returns true if a change was made.
+  bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+                  const TargetTransformInfo *TTI, const TargetLowering* TL,
+                  const DataLayout& DL);
+};
+
+bool ExpandMemCmpPass::runOnBlock(
+    BasicBlock &BB, const TargetLibraryInfo *TLI,
+    const TargetTransformInfo *TTI, const TargetLowering* TL,
+    const DataLayout& DL) {
+  for (Instruction& I : BB) {
+    CallInst *CI = dyn_cast<CallInst>(&I);
+    if (!CI) {
+      continue;
+    }
+    LibFunc Func;
+    if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+        Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TL, &DL)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+
+PreservedAnalyses ExpandMemCmpPass::runImpl(
+    Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+    const TargetLowering* TL) {
+  const DataLayout& DL = F.getParent()->getDataLayout();
+  bool MadeChanges = false;
+  for (auto BBIt = F.begin(); BBIt != F.end();) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+      MadeChanges = true;
+      // If changes were made, restart the function from the beginning, since
+      // the structure of the function was changed.
+      BBIt = F.begin();
+    } else {
+      ++BBIt;
+    }
+  }
+  return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+} // namespace
+
+char ExpandMemCmpPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
+                      "Expand memcmp() to load/stores", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
+                    "Expand memcmp() to load/stores", false, false)
+
+FunctionPass *llvm::createExpandMemCmpPass() {
+  return new ExpandMemCmpPass();
+}
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 4ce86f27a7d..b73aeb18382 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -17,9 +17,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp
index 70dca3b74b2..abf487a4f19 100644
--- a/lib/CodeGen/ExpandReductions.cpp
+++ b/lib/CodeGen/ExpandReductions.cpp
@@ -95,7 +95,7 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
       // and it can't be handled by generating this shuffle sequence.
       // TODO: Implement scalarization of ordered reductions here for targets
       // without native support.
-      if (!II->getFastMathFlags().unsafeAlgebra())
+      if (!II->getFastMathFlags().isFast())
         continue;
       Vec = II->getArgOperand(1);
       break;
diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp
index 9781338f952..07cb7016139 100644
--- a/lib/CodeGen/FEntryInserter.cpp
+++ b/lib/CodeGen/FEntryInserter.cpp
@@ -15,10 +15,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
index 35246545ca9..2064ce7ac7b 100644
--- a/lib/CodeGen/GCRootLowering.cpp
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -18,14 +18,14 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 8e31ed0a015..45eb605c3c2 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -54,7 +54,7 @@
 #include "llvm/Support/LowLevelTypeImpl.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index fb954f3c3f1..59f34d730d0 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -20,9 +20,9 @@
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 #include <iterator>
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 99f605abe06..a8cfe0b89a0 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -173,12 +173,18 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
 
   MIRBuilder.setInstr(MI);
 
+  int64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+  int64_t NarrowSize = NarrowTy.getSizeInBits();
+
   switch (MI.getOpcode()) {
   default:
     return UnableToLegalize;
   case TargetOpcode::G_IMPLICIT_DEF: {
-    int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
-                   NarrowTy.getSizeInBits();
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp0 / NarrowSize;
 
     SmallVector<unsigned, 2> DstRegs;
     for (int i = 0; i < NumParts; ++i) {
@@ -191,9 +197,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return Legalized;
   }
   case TargetOpcode::G_ADD: {
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
     // Expand in terms of carry-setting/consuming G_ADDE instructions.
-    int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
-                   NarrowTy.getSizeInBits();
+    int NumParts = SizeOp0 / NarrowTy.getSizeInBits();
 
     SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
@@ -221,9 +230,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     if (TypeIdx != 1)
       return UnableToLegalize;
 
-    int64_t NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize;
+    int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+    // FIXME: add support for when SizeOp1 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp1 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp1 / NarrowSize;
 
     SmallVector<unsigned, 2> SrcRegs, DstRegs;
     SmallVector<uint64_t, 2> Indexes;
@@ -270,12 +282,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return Legalized;
   }
   case TargetOpcode::G_INSERT: {
-    if (TypeIdx != 0)
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
       return UnableToLegalize;
 
-    int64_t NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+    int NumParts = SizeOp0 / NarrowSize;
 
     SmallVector<unsigned, 2> SrcRegs, DstRegs;
     SmallVector<uint64_t, 2> Indexes;
@@ -330,9 +342,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return Legalized;
   }
   case TargetOpcode::G_LOAD: {
-    unsigned NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp0 / NarrowSize;
     LLT OffsetTy = LLT::scalar(
         MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
 
@@ -357,9 +371,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return Legalized;
   }
   case TargetOpcode::G_STORE: {
-    unsigned NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp0 / NarrowSize;
     LLT OffsetTy = LLT::scalar(
         MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
 
@@ -381,9 +397,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return Legalized;
   }
   case TargetOpcode::G_CONSTANT: {
-    unsigned NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp0 / NarrowSize;
     const APInt &Cst = MI.getOperand(1).getCImm()->getValue();
     LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
 
@@ -410,9 +428,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     // ...
     // AN = BinOp<Ty/N> BN, CN
     // A = G_MERGE_VALUES A1, ..., AN
-    unsigned NarrowSize = NarrowTy.getSizeInBits();
-    int NumParts =
-        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+
+    // FIXME: add support for when SizeOp0 isn't an exact multiple of
+    // NarrowSize.
+    if (SizeOp0 % NarrowSize != 0)
+      return UnableToLegalize;
+    int NumParts = SizeOp0 / NarrowSize;
 
     // List the registers where the destination will be scattered.
     SmallVector<unsigned, 2> DstRegs;
@@ -854,7 +875,12 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case TargetOpcode::G_ADD: {
     unsigned NarrowSize = NarrowTy.getSizeInBits();
     unsigned DstReg = MI.getOperand(0).getReg();
-    int NumParts = MRI.getType(DstReg).getSizeInBits() / NarrowSize;
+    unsigned Size = MRI.getType(DstReg).getSizeInBits();
+    int NumParts = Size / NarrowSize;
+    // FIXME: Don't know how to handle the situation where the small vectors
+    // aren't all the same size yet.
+    if (Size % NarrowSize != 0)
+      return UnableToLegalize;
 
     MIRBuilder.setInstr(MI);
 
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index e7a46eadb44..074cfa61a29 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -28,46 +28,130 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include <algorithm>
-#include <cassert>
-#include <tuple>
-#include <utility>
-
+#include <map>
 using namespace llvm;
 
-LegalizerInfo::LegalizerInfo() {
-  DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar;
-
-  // FIXME: these two can be legalized to the fundamental load/store Jakob
-  // proposed. Once loads & stores are supported.
-  DefaultActions[TargetOpcode::G_ANYEXT] = Legal;
-  DefaultActions[TargetOpcode::G_TRUNC] = Legal;
+LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
+  // Set defaults.
+  // FIXME: these two (G_ANYEXT and G_TRUNC?) can be legalized to the
+  // fundamental load/store Jakob proposed. Once loads & stores are supported.
+  setScalarAction(TargetOpcode::G_ANYEXT, 1, {{1, Legal}});
+  setScalarAction(TargetOpcode::G_ZEXT, 1, {{1, Legal}});
+  setScalarAction(TargetOpcode::G_SEXT, 1, {{1, Legal}});
+  setScalarAction(TargetOpcode::G_TRUNC, 0, {{1, Legal}});
+  setScalarAction(TargetOpcode::G_TRUNC, 1, {{1, Legal}});
 
-  DefaultActions[TargetOpcode::G_INTRINSIC] = Legal;
-  DefaultActions[TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS] = Legal;
+  setScalarAction(TargetOpcode::G_INTRINSIC, 0, {{1, Legal}});
+  setScalarAction(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS, 0, {{1, Legal}});
 
-  DefaultActions[TargetOpcode::G_ADD] = NarrowScalar;
-  DefaultActions[TargetOpcode::G_LOAD] = NarrowScalar;
-  DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;
-  DefaultActions[TargetOpcode::G_OR] = NarrowScalar;
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_IMPLICIT_DEF, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_ADD, 0, widenToLargerTypesAndNarrowToLargest);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_OR, 0, widenToLargerTypesAndNarrowToLargest);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_LOAD, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_STORE, 0, narrowToSmallerAndUnsupportedIfTooSmall);
 
-  DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar;
-  DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar;
-  DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar;
-  DefaultActions[TargetOpcode::G_FNEG] = Lower;
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_BRCOND, 0, widenToLargerTypesUnsupportedOtherwise);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_INSERT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_EXTRACT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+  setLegalizeScalarToDifferentSizeStrategy(
+      TargetOpcode::G_EXTRACT, 1, narrowToSmallerAndUnsupportedIfTooSmall);
+  setScalarAction(TargetOpcode::G_FNEG, 0, {{1, Lower}});
 }
 
 void LegalizerInfo::computeTables() {
-  for (unsigned Opcode = 0; Opcode <= LastOp - FirstOp; ++Opcode) {
-    for (unsigned Idx = 0, End = Actions[Opcode].size(); Idx != End; ++Idx) {
-      for (auto &Action : Actions[Opcode][Idx]) {
-        LLT Ty = Action.first;
-        if (!Ty.isVector())
-          continue;
-
-        auto &Entry = MaxLegalVectorElts[std::make_pair(Opcode + FirstOp,
-                                                        Ty.getElementType())];
-        Entry = std::max(Entry, Ty.getNumElements());
+  assert(TablesInitialized == false);
+
+  for (unsigned OpcodeIdx = 0; OpcodeIdx <= LastOp - FirstOp; ++OpcodeIdx) {
+    const unsigned Opcode = FirstOp + OpcodeIdx;
+    for (unsigned TypeIdx = 0; TypeIdx != SpecifiedActions[OpcodeIdx].size();
+         ++TypeIdx) {
+      // 0. Collect information specified through the setAction API, i.e.
+      // for specific bit sizes.
+      // For scalar types:
+      SizeAndActionsVec ScalarSpecifiedActions;
+      // For pointer types:
+      std::map<uint16_t, SizeAndActionsVec> AddressSpace2SpecifiedActions;
+      // For vector types:
+      std::map<uint16_t, SizeAndActionsVec> ElemSize2SpecifiedActions;
+      for (auto LLT2Action : SpecifiedActions[OpcodeIdx][TypeIdx]) {
+        const LLT Type = LLT2Action.first;
+        const LegalizeAction Action = LLT2Action.second;
+
+        auto SizeAction = std::make_pair(Type.getSizeInBits(), Action);
+        if (Type.isPointer())
+          AddressSpace2SpecifiedActions[Type.getAddressSpace()].push_back(
+              SizeAction);
+        else if (Type.isVector())
+          ElemSize2SpecifiedActions[Type.getElementType().getSizeInBits()]
+              .push_back(SizeAction);
+        else
+          ScalarSpecifiedActions.push_back(SizeAction);
+      }
+
+      // 1. Handle scalar types
+      {
+        // Decide how to handle bit sizes for which no explicit specification
+        // was given.
+        SizeChangeStrategy S = &unsupportedForDifferentSizes;
+        if (TypeIdx < ScalarSizeChangeStrategies[OpcodeIdx].size() &&
+            ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+          S = ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx];
+        std::sort(ScalarSpecifiedActions.begin(), ScalarSpecifiedActions.end());
+        checkPartialSizeAndActionsVector(ScalarSpecifiedActions);
+        setScalarAction(Opcode, TypeIdx, S(ScalarSpecifiedActions));
       }
+
+      // 2. Handle pointer types
+      for (auto PointerSpecifiedActions : AddressSpace2SpecifiedActions) {
+        std::sort(PointerSpecifiedActions.second.begin(),
+                  PointerSpecifiedActions.second.end());
+        checkPartialSizeAndActionsVector(PointerSpecifiedActions.second);
+        // For pointer types, we assume that there isn't a meaningfull way
+        // to change the number of bits used in the pointer.
+        setPointerAction(
+            Opcode, TypeIdx, PointerSpecifiedActions.first,
+            unsupportedForDifferentSizes(PointerSpecifiedActions.second));
+      }
+
+      // 3. Handle vector types
+      SizeAndActionsVec ElementSizesSeen;
+      for (auto VectorSpecifiedActions : ElemSize2SpecifiedActions) {
+        std::sort(VectorSpecifiedActions.second.begin(),
+                  VectorSpecifiedActions.second.end());
+        const uint16_t ElementSize = VectorSpecifiedActions.first;
+        ElementSizesSeen.push_back({ElementSize, Legal});
+        checkPartialSizeAndActionsVector(VectorSpecifiedActions.second);
+        // For vector types, we assume that the best way to adapt the number
+        // of elements is to the next larger number of elements type for which
+        // the vector type is legal, unless there is no such type. In that case,
+        // legalize towards a vector type with a smaller number of elements.
+        SizeAndActionsVec NumElementsActions;
+        for (SizeAndAction BitsizeAndAction : VectorSpecifiedActions.second) {
+          assert(BitsizeAndAction.first % ElementSize == 0);
+          const uint16_t NumElements = BitsizeAndAction.first / ElementSize;
+          NumElementsActions.push_back({NumElements, BitsizeAndAction.second});
+        }
+        setVectorNumElementAction(
+            Opcode, TypeIdx, ElementSize,
+            moreToWiderTypesAndLessToWidest(NumElementsActions));
+      }
+      std::sort(ElementSizesSeen.begin(), ElementSizesSeen.end());
+      SizeChangeStrategy VectorElementSizeChangeStrategy =
+          &unsupportedForDifferentSizes;
+      if (TypeIdx < VectorElementSizeChangeStrategies[OpcodeIdx].size() &&
+          VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+        VectorElementSizeChangeStrategy =
+            VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx];
+      setScalarInVectorAction(
+          Opcode, TypeIdx, VectorElementSizeChangeStrategy(ElementSizesSeen));
     }
   }
 
@@ -90,69 +174,24 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {
       Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)
     return std::make_pair(Legal, Aspect.Type);
 
-  LLT Ty = Aspect.Type;
-  LegalizeAction Action = findInActions(Aspect);
-  // LegalizerHelper is not able to handle non-power-of-2 types right now, so do
-  // not try to legalize them unless they are marked as Legal or Custom.
-  // FIXME: This is a temporary hack until the general non-power-of-2
-  // legalization works.
-  if (!isPowerOf2_64(Ty.getSizeInBits()) &&
-      !(Action == Legal || Action == Custom))
-    return std::make_pair(Unsupported, LLT());
-
-  if (Action != NotFound)
-    return findLegalAction(Aspect, Action);
-
-  unsigned Opcode = Aspect.Opcode;
-  if (!Ty.isVector()) {
-    auto DefaultAction = DefaultActions.find(Aspect.Opcode);
-    if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
-      return std::make_pair(Legal, Ty);
-
-    if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower)
-      return std::make_pair(Lower, Ty);
-
-    if (DefaultAction == DefaultActions.end() ||
-        DefaultAction->second != NarrowScalar)
-      return std::make_pair(Unsupported, LLT());
-    return findLegalAction(Aspect, NarrowScalar);
-  }
-
-  LLT EltTy = Ty.getElementType();
-  int NumElts = Ty.getNumElements();
-
-  auto ScalarAction = ScalarInVectorActions.find(std::make_pair(Opcode, EltTy));
-  if (ScalarAction != ScalarInVectorActions.end() &&
-      ScalarAction->second != Legal)
-    return findLegalAction(Aspect, ScalarAction->second);
-
-  // The element type is legal in principle, but the number of elements is
-  // wrong.
-  auto MaxLegalElts = MaxLegalVectorElts.lookup(std::make_pair(Opcode, EltTy));
-  if (MaxLegalElts > NumElts)
-    return findLegalAction(Aspect, MoreElements);
-
-  if (MaxLegalElts == 0) {
-    // Scalarize if there's no legal vector type, which is just a special case
-    // of FewerElements.
-    return std::make_pair(FewerElements, EltTy);
-  }
-
-  return findLegalAction(Aspect, FewerElements);
+  if (Aspect.Type.isScalar() || Aspect.Type.isPointer())
+    return findScalarLegalAction(Aspect);
+  assert(Aspect.Type.isVector());
+  return findVectorLegalAction(Aspect);
 }
 
 std::tuple<LegalizerInfo::LegalizeAction, unsigned, LLT>
 LegalizerInfo::getAction(const MachineInstr &MI,
                          const MachineRegisterInfo &MRI) const {
   SmallBitVector SeenTypes(8);
-  const MCInstrDesc &MCID = MI.getDesc();
-  const MCOperandInfo *OpInfo = MCID.OpInfo;
-  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+  const MCOperandInfo *OpInfo = MI.getDesc().OpInfo;
+  // FIXME: probably we'll need to cache the results here somehow?
+  for (unsigned i = 0; i < MI.getDesc().getNumOperands(); ++i) {
     if (!OpInfo[i].isGenericType())
       continue;
 
-    // We don't want to repeatedly check the same operand index, that
-    // could get expensive.
+    // We must only record actions once for each TypeIdx; otherwise we'd
+    // try to legalize operands multiple times down the line.
     unsigned TypeIdx = OpInfo[i].getGenericTypeIndex();
     if (SeenTypes[TypeIdx])
       continue;
@@ -172,38 +211,166 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
   return std::get<0>(getAction(MI, MRI)) == Legal;
 }
 
-Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
-                                 LegalizeAction Action) const {
-  switch(Action) {
-  default:
-    llvm_unreachable("Cannot find legal type");
+bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &MIRBuilder) const {
+  return false;
+}
+
+LegalizerInfo::SizeAndActionsVec
+LegalizerInfo::increaseToLargerTypesAndDecreaseToLargest(
+    const SizeAndActionsVec &v, LegalizeAction IncreaseAction,
+    LegalizeAction DecreaseAction) {
+  SizeAndActionsVec result;
+  unsigned LargestSizeSoFar = 0;
+  if (v.size() >= 1 && v[0].first != 1)
+    result.push_back({1, IncreaseAction});
+  for (size_t i = 0; i < v.size(); ++i) {
+    result.push_back(v[i]);
+    LargestSizeSoFar = v[i].first;
+    if (i + 1 < v.size() && v[i + 1].first != v[i].first + 1) {
+      result.push_back({LargestSizeSoFar + 1, IncreaseAction});
+      LargestSizeSoFar = v[i].first + 1;
+    }
+  }
+  result.push_back({LargestSizeSoFar + 1, DecreaseAction});
+  return result;
+}
+
+LegalizerInfo::SizeAndActionsVec
+LegalizerInfo::decreaseToSmallerTypesAndIncreaseToSmallest(
+    const SizeAndActionsVec &v, LegalizeAction DecreaseAction,
+    LegalizeAction IncreaseAction) {
+  SizeAndActionsVec result;
+  if (v.size() == 0 || v[0].first != 1)
+    result.push_back({1, IncreaseAction});
+  for (size_t i = 0; i < v.size(); ++i) {
+    result.push_back(v[i]);
+    if (i + 1 == v.size() || v[i + 1].first != v[i].first + 1) {
+      result.push_back({v[i].first + 1, DecreaseAction});
+    }
+  }
+  return result;
+}
+
+LegalizerInfo::SizeAndAction
+LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) {
+  assert(Size >= 1);
+  // Find the last element in Vec that has a bitsize equal to or smaller than
+  // the requested bit size.
+  // That is the element just before the first element that is bigger than Size.
+  auto VecIt = std::upper_bound(
+      Vec.begin(), Vec.end(), Size,
+      [](const uint32_t Size, const SizeAndAction lhs) -> bool {
+        return Size < lhs.first;
+      });
+  assert(VecIt != Vec.begin() && "Does Vec not start with size 1?");
+  --VecIt;
+  int VecIdx = VecIt - Vec.begin();
+
+  LegalizeAction Action = Vec[VecIdx].second;
+  switch (Action) {
   case Legal:
   case Lower:
   case Libcall:
   case Custom:
-    return Aspect.Type;
+    return {Size, Action};
+  case FewerElements:
+    // FIXME: is this special case still needed and correct?
+    // Special case for scalarization:
+    if (Vec == SizeAndActionsVec({{1, FewerElements}}))
+      return {1, FewerElements};
+    LLVM_FALLTHROUGH;
   case NarrowScalar: {
-    return findLegalizableSize(
-        Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
-  }
-  case WidenScalar: {
-    return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT {
-      return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
-    });
-  }
-  case FewerElements: {
-    return findLegalizableSize(
-        Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+    // The following needs to be a loop, as for now, we do allow needing to
+    // go over "Unsupported" bit sizes before finding a legalizable bit size.
+    // e.g. (s8, WidenScalar), (s9, Unsupported), (s32, Legal). if Size==8,
+    // we need to iterate over s9, and then to s32 to return (s32, Legal).
+    // If we want to get rid of the below loop, we should have stronger asserts
+    // when building the SizeAndActionsVecs, probably not allowing
+    // "Unsupported" unless at the ends of the vector.
+    for (int i = VecIdx - 1; i >= 0; --i)
+      if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+          Vec[i].second != Unsupported)
+        return {Vec[i].first, Action};
+    llvm_unreachable("");
   }
+  case WidenScalar:
   case MoreElements: {
-    return findLegalizableSize(
-        Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+    // See above, the following needs to be a loop, at least for now.
+    for (std::size_t i = VecIdx + 1; i < Vec.size(); ++i)
+      if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+          Vec[i].second != Unsupported)
+        return {Vec[i].first, Action};
+    llvm_unreachable("");
   }
+  case Unsupported:
+    return {Size, Unsupported};
+  case NotFound:
+    llvm_unreachable("NotFound");
   }
+  llvm_unreachable("Action has an unknown enum value");
 }
 
-bool LegalizerInfo::legalizeCustom(MachineInstr &MI,
-                                   MachineRegisterInfo &MRI,
-                                   MachineIRBuilder &MIRBuilder) const {
-  return false;
+std::pair<LegalizerInfo::LegalizeAction, LLT>
+LegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
+  assert(Aspect.Type.isScalar() || Aspect.Type.isPointer());
+  if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+    return {NotFound, LLT()};
+  const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+  if (Aspect.Type.isPointer() &&
+      AddrSpace2PointerActions[OpcodeIdx].find(Aspect.Type.getAddressSpace()) ==
+          AddrSpace2PointerActions[OpcodeIdx].end()) {
+    return {NotFound, LLT()};
+  }
+  const SmallVector<SizeAndActionsVec, 1> &Actions =
+      Aspect.Type.isPointer()
+          ? AddrSpace2PointerActions[OpcodeIdx]
+                .find(Aspect.Type.getAddressSpace())
+                ->second
+          : ScalarActions[OpcodeIdx];
+  if (Aspect.Idx >= Actions.size())
+    return {NotFound, LLT()};
+  const SizeAndActionsVec &Vec = Actions[Aspect.Idx];
+  // FIXME: speed up this search, e.g. by using a results cache for repeated
+  // queries?
+  auto SizeAndAction = findAction(Vec, Aspect.Type.getSizeInBits());
+  return {SizeAndAction.second,
+          Aspect.Type.isScalar() ? LLT::scalar(SizeAndAction.first)
+                                 : LLT::pointer(Aspect.Type.getAddressSpace(),
+                                                SizeAndAction.first)};
+}
+
+std::pair<LegalizerInfo::LegalizeAction, LLT>
+LegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
+  assert(Aspect.Type.isVector());
+  // First legalize the vector element size, then legalize the number of
+  // lanes in the vector.
+  if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+    return {NotFound, Aspect.Type};
+  const unsigned OpcodeIdx = Aspect.Opcode - FirstOp;
+  const unsigned TypeIdx = Aspect.Idx;
+  if (TypeIdx >= ScalarInVectorActions[OpcodeIdx].size())
+    return {NotFound, Aspect.Type};
+  const SizeAndActionsVec &ElemSizeVec =
+      ScalarInVectorActions[OpcodeIdx][TypeIdx];
+
+  LLT IntermediateType;
+  auto ElementSizeAndAction =
+      findAction(ElemSizeVec, Aspect.Type.getScalarSizeInBits());
+  IntermediateType =
+      LLT::vector(Aspect.Type.getNumElements(), ElementSizeAndAction.first);
+  if (ElementSizeAndAction.second != Legal)
+    return {ElementSizeAndAction.second, IntermediateType};
+
+  auto i = NumElements2Actions[OpcodeIdx].find(
+      IntermediateType.getScalarSizeInBits());
+  if (i == NumElements2Actions[OpcodeIdx].end()) {
+    return {NotFound, IntermediateType};
+  }
+  const SizeAndActionsVec &NumElementsVec = (*i).second[TypeIdx];
+  auto NumElementsAndAction =
+      findAction(NumElementsVec, IntermediateType.getNumElements());
+  return {NumElementsAndAction.second,
+          LLT::vector(NumElementsAndAction.first,
+                      IntermediateType.getScalarSizeInBits())};
 }
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index b4fe5f15fdd..230c9ef04c5 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -15,8 +15,8 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index f117c609453..3854e9b263d 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -19,10 +19,10 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index a9f3d73a294..9ae0f970f42 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -17,9 +17,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 #define DEBUG_TYPE "globalisel-utils"
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 08720d1271f..6d2a55c65f4 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -31,6 +31,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -40,7 +41,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index bf0f88d49a8..b1cac2a107d 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -44,6 +44,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/LLVMContext.h"
@@ -51,7 +52,6 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 2e991de6221..a383b72dce6 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -41,6 +41,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/BlockFrequency.h"
 #include "llvm/Support/BranchProbability.h"
@@ -49,7 +50,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index a45b1e39fee..9328e2d900a 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -36,6 +36,8 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
@@ -46,8 +48,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 0c81306a9a5..1923a8c2529 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -39,6 +39,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
@@ -51,7 +52,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 92cca1a5495..61fbfdd64a2 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -16,10 +16,10 @@
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index a9aec926115..f9c5652e8a1 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -34,10 +34,10 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 2eab0376da2..33ae476bf4a 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -30,7 +30,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp
new file mode 100644
index 00000000000..62596440c73
--- /dev/null
+++ b/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -0,0 +1,626 @@
+//===-------------- MIRCanonicalizer.cpp - MIR Canonicalizer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to employ a canonical code transformation so
+// that code compiled with slightly different IR passes can be diffed more
+// effectively than otherwise. This is done by renaming vregs in a given
+// LiveRange in a canonical way. This pass also does a pseudo-scheduling to
+// move defs closer to their use inorder to reduce diffs caused by slightly
+// different schedules.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-canonicalizer example.mir
+//
+// Reorders instructions canonically.
+// Renames virtual register operands canonically.
+// Strips certain MIR artifacts (optionally).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <queue>
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRCanonicalizerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-canonicalizer"
+
+static cl::opt<unsigned>
+CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+                           cl::value_desc("N"),
+                           cl::desc("Function number to canonicalize."));
+
+static cl::opt<unsigned>
+CanonicalizeBasicBlockNumber("canon-nth-basicblock", cl::Hidden, cl::init(~0u),
+                             cl::value_desc("N"),
+                             cl::desc("BasicBlock number to canonicalize."));
+
+namespace {
+
+class MIRCanonicalizer : public MachineFunctionPass {
+public:
+  static char ID;
+  MIRCanonicalizer() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "Rename register operands in a canonical ordering.";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
+class TypedVReg {
+  VRType type;
+  unsigned reg;
+
+public:
+  TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {}
+  TypedVReg(VRType type) : type(type), reg(~0U) {
+    assert(type != RSE_Reg && "Expected a non-register type.");
+  }
+
+  bool isReg()        const { return type == RSE_Reg;          }
+  bool isFrameIndex() const { return type == RSE_FrameIndex;   }
+  bool isCandidate()  const { return type == RSE_NewCandidate; }
+
+  VRType getType() const { return type; }
+  unsigned getReg() const {
+    assert(this->isReg() && "Expected a virtual or physical register.");
+    return reg;
+  }
+};
+
+char MIRCanonicalizer::ID;
+
+char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer",
+                      "Rename Register Operands Canonically", false, false)
+
+INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer",
+                    "Rename Register Operands Canonically", false, false)
+
+static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+  std::vector<MachineBasicBlock *> RPOList;
+  for (auto MBB : RPOT) {
+    RPOList.push_back(MBB);
+  }
+
+  return RPOList;
+}
+
+// Set a dummy vreg. We use this vregs register class to generate throw-away
+// vregs that are used to skip vreg numbers so that vreg numbers line up.
+static unsigned GetDummyVReg(const MachineFunction &MF) {
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (auto &MO : MI.operands()) {
+        if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+          continue;
+        return MO.getReg();
+      }
+    }
+  }
+
+  return ~0U;
+}
+
+static bool rescheduleCanonically(MachineBasicBlock *MBB) {
+
+  bool Changed = false;
+
+  // Calculates the distance of MI from the begining of its parent BB.
+  auto getInstrIdx = [](const MachineInstr &MI) {
+    unsigned i = 0;
+    for (auto &CurMI : *MI.getParent()) {
+      if (&CurMI == &MI)
+        return i;
+      i++;
+    }
+    return ~0U;
+  };
+
+  // Pre-Populate vector of instructions to reschedule so that we don't
+  // clobber the iterator.
+  std::vector<MachineInstr *> Instructions;
+  for (auto &MI : *MBB) {
+    Instructions.push_back(&MI);
+  }
+
+  for (auto *II : Instructions) {
+    if (II->getNumOperands() == 0)
+      continue;
+
+    MachineOperand &MO = II->getOperand(0);
+    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+
+    MachineInstr *Def = II;
+    unsigned Distance = ~0U;
+    MachineInstr *UseToBringDefCloserTo = nullptr;
+    MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+    for (auto &UO : MRI->use_nodbg_operands(MO.getReg())) {
+      MachineInstr *UseInst = UO.getParent();
+
+      const unsigned DefLoc = getInstrIdx(*Def);
+      const unsigned UseLoc = getInstrIdx(*UseInst);
+      const unsigned Delta = (UseLoc - DefLoc);
+
+      if (UseInst->getParent() != Def->getParent())
+        continue;
+      if (DefLoc >= UseLoc)
+        continue;
+
+      if (Delta < Distance) {
+        Distance = Delta;
+        UseToBringDefCloserTo = UseInst;
+      }
+    }
+
+    const auto BBE = MBB->instr_end();
+    MachineBasicBlock::iterator DefI = BBE;
+    MachineBasicBlock::iterator UseI = BBE;
+
+    for (auto BBI = MBB->instr_begin(); BBI != BBE; ++BBI) {
+
+      if (DefI != BBE && UseI != BBE)
+        break;
+
+      if ((&*BBI != Def) && (&*BBI != UseToBringDefCloserTo))
+        continue;
+
+      if (&*BBI == Def) {
+        DefI = BBI;
+        continue;
+      }
+
+      if (&*BBI == UseToBringDefCloserTo) {
+        UseI = BBI;
+        continue;
+      }
+    }
+
+    if (DefI == BBE || UseI == BBE)
+      continue;
+
+    DEBUG({
+      dbgs() << "Splicing ";
+      DefI->dump();
+      dbgs() << " right before: ";
+      UseI->dump();
+    });
+
+    Changed = true;
+    MBB->splice(UseI, MBB, DefI);
+  }
+
+  return Changed;
+}
+
+/// Here we find our candidates. What makes an interesting candidate?
+/// An candidate for a canonicalization tree root is normally any kind of
+/// instruction that causes side effects such as a store to memory or a copy to
+/// a physical register or a return instruction. We use these as an expression
+/// tree root that we walk inorder to build a canonical walk which should result
+/// in canoncal vreg renaming.
+static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
+  std::vector<MachineInstr *> Candidates;
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+    MachineInstr *MI = &*II;
+
+    bool DoesMISideEffect = false;
+
+    if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
+      const unsigned Dst = MI->getOperand(0).getReg();
+      DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
+
+      for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+        if (DoesMISideEffect) break;
+        DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
+      }
+    }
+
+    if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
+      continue;
+
+    DEBUG(dbgs() << "Found Candidate:  "; MI->dump(););
+    Candidates.push_back(MI);
+  }
+
+  return Candidates;
+}
+
+void doCandidateWalk(std::vector<TypedVReg> &VRegs,
+                     std::queue <TypedVReg> &RegQueue,
+                     std::vector<MachineInstr *> &VisitedMIs,
+                     const MachineBasicBlock *MBB) {
+
+  const MachineFunction &MF = *MBB->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  while (!RegQueue.empty()) {
+
+    auto TReg = RegQueue.front();
+    RegQueue.pop();
+
+    if (TReg.isFrameIndex()) {
+      DEBUG(dbgs() << "Popping frame index.\n";);
+      VRegs.push_back(TypedVReg(RSE_FrameIndex));
+      continue;
+    }
+
+    assert(TReg.isReg() && "Expected vreg or physreg.");
+    unsigned Reg = TReg.getReg();
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      DEBUG({
+        dbgs() << "Popping vreg ";
+        MRI.def_begin(Reg)->dump();
+        dbgs() << "\n";
+      });
+
+      if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
+            return TR.isReg() && TR.getReg() == Reg;
+          })) {
+        VRegs.push_back(TypedVReg(Reg));
+      }
+    } else {
+      DEBUG(dbgs() << "Popping physreg.\n";);
+      VRegs.push_back(TypedVReg(Reg));
+      continue;
+    }
+
+    for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
+      MachineInstr *Def = RI->getParent();
+
+      if (Def->getParent() != MBB)
+        continue;
+
+      if (llvm::any_of(VisitedMIs,
+                       [&](const MachineInstr *VMI) { return Def == VMI; })) {
+        break;
+      }
+
+      DEBUG({
+        dbgs() << "\n========================\n";
+        dbgs() << "Visited MI: ";
+        Def->dump();
+        dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
+        dbgs() << "\n========================\n";
+      });
+      VisitedMIs.push_back(Def);
+      for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+
+        MachineOperand &MO = Def->getOperand(I);
+        if (MO.isFI()) {
+          DEBUG(dbgs() << "Pushing frame index.\n";);
+          RegQueue.push(TypedVReg(RSE_FrameIndex));
+        }
+
+        if (!MO.isReg())
+          continue;
+        RegQueue.push(TypedVReg(MO.getReg()));
+      }
+    }
+  }
+}
+
+// TODO: Work to remove this in the future. One day when we have named vregs
+// we should be able to form the canonical name based on some characteristic
+// we see in that point of the expression tree (like if we were to name based
+// on some sort of value numbering scheme).
+static void SkipVRegs(unsigned &VRegGapIndex, MachineRegisterInfo &MRI,
+                      const TargetRegisterClass *RC) {
+  const unsigned VR_GAP = (++VRegGapIndex * 1000);
+
+  DEBUG({
+    dbgs() << "Adjusting per-BB VR_GAP for BB" << VRegGapIndex << " to "
+           << VR_GAP << "\n";
+  });
+
+  unsigned I = MRI.createVirtualRegister(RC);
+  const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+  while (I != E) {
+    I = MRI.createVirtualRegister(RC);
+  }
+}
+
+static std::map<unsigned, unsigned>
+GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
+                 const std::vector<unsigned> &renamedInOtherBB,
+                 MachineRegisterInfo &MRI,
+                 const TargetRegisterClass *RC) {
+  std::map<unsigned, unsigned> VRegRenameMap;
+  unsigned LastRenameReg = MRI.createVirtualRegister(RC);
+  bool FirstCandidate = true;
+
+  for (auto &vreg : VRegs) {
+    if (vreg.isFrameIndex()) {
+      // We skip one vreg for any frame index because there is a good chance
+      // (especially when comparing SelectionDAG to GlobalISel generated MIR)
+      // that in the other file we are just getting an incoming vreg that comes
+      // from a copy from a frame index. So it's safe to skip by one.
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+      continue;
+    } else if (vreg.isCandidate()) {
+
+      // After the first candidate, for every subsequent candidate, we skip mod
+      // 10 registers so that the candidates are more likely to start at the
+      // same vreg number making it more likely that the canonical walk from the
+      // candidate insruction. We don't need to skip from the first candidate of
+      // the BasicBlock because we already skip ahead several vregs for each BB.
+      while (LastRenameReg % 10) {
+        if (!FirstCandidate) break;
+        LastRenameReg = MRI.createVirtualRegister(RC);
+
+        DEBUG({
+          dbgs() << "Skipping rename for new candidate " << LastRenameReg
+                 << "\n";
+        });
+      }
+      FirstCandidate = false;
+      continue;
+    } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
+      LastRenameReg = MRI.createVirtualRegister(RC);
+      DEBUG({
+        dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
+      });
+      continue;
+    }
+
+    auto Reg = vreg.getReg();
+    if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
+      DEBUG(dbgs() << "Vreg " << Reg << " already renamed in other BB.\n";);
+      continue;
+    }
+
+    auto Rename = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    LastRenameReg = Rename;
+
+    if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
+      DEBUG(dbgs() << "Mapping vreg ";);
+      if (MRI.reg_begin(Reg) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Reg;);
+      }
+      DEBUG(dbgs() << " to ";);
+      if (MRI.reg_begin(Rename) != MRI.reg_end()) {
+        DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+      } else {
+        DEBUG(dbgs() << Rename;);
+      }
+      DEBUG(dbgs() << "\n";);
+
+      VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
+    }
+  }
+
+  return VRegRenameMap;
+}
+
+static bool doVRegRenaming(std::vector<unsigned> &RenamedInOtherBB,
+                           const std::map<unsigned, unsigned> &VRegRenameMap,
+                           MachineRegisterInfo &MRI) {
+  bool Changed = false;
+  for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
+
+    auto VReg = I->first;
+    auto Rename = I->second;
+
+    RenamedInOtherBB.push_back(Rename);
+
+    std::vector<MachineOperand *> RenameMOs;
+    for (auto &MO : MRI.reg_operands(VReg)) {
+      RenameMOs.push_back(&MO);
+    }
+
+    for (auto *MO : RenameMOs) {
+      Changed = true;
+      MO->setReg(Rename);
+
+      if (!MO->isDef())
+        MO->setIsKill(false);
+    }
+  }
+
+  return Changed;
+}
+
+static bool doDefKillClear(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  for (auto &MI : *MBB) {
+    for (auto &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      if (!MO.isDef() && MO.isKill()) {
+        Changed = true;
+        MO.setIsKill(false);
+      }
+
+      if (MO.isDef() && MO.isDead()) {
+        Changed = true;
+        MO.setIsDead(false);
+      }
+    }
+  }
+
+  return Changed;
+}
+
+static bool runOnBasicBlock(MachineBasicBlock *MBB,
+                            std::vector<StringRef> &bbNames,
+                            std::vector<unsigned> &renamedInOtherBB,
+                            unsigned &basicBlockNum, unsigned &VRegGapIndex) {
+
+  if (CanonicalizeBasicBlockNumber != ~0U) {
+    if (CanonicalizeBasicBlockNumber != basicBlockNum++)
+      return false;
+    DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() << "\n";);
+  }
+
+  if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) {
+    DEBUG({
+      dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName()
+             << "\n";
+    });
+    return false;
+  }
+
+  DEBUG({
+    dbgs() << "\n\n  NEW BASIC BLOCK: " << MBB->getName() << "  \n\n";
+    dbgs() << "\n\n================================================\n\n";
+  });
+
+  bool Changed = false;
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  const unsigned DummyVReg = GetDummyVReg(MF);
+  const TargetRegisterClass *DummyRC =
+    (DummyVReg == ~0U) ? nullptr : MRI.getRegClass(DummyVReg);
+  if (!DummyRC) return false;
+
+  bbNames.push_back(MBB->getName());
+  DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+
+  DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+  Changed |= rescheduleCanonically(MBB);
+  DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+
+  std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
+  std::vector<MachineInstr *> VisitedMIs;
+  std::copy(Candidates.begin(), Candidates.end(),
+            std::back_inserter(VisitedMIs));
+
+  std::vector<TypedVReg> VRegs;
+  for (auto candidate : Candidates) {
+    VRegs.push_back(TypedVReg(RSE_NewCandidate));
+
+    std::queue<TypedVReg> RegQueue;
+
+    // Here we walk the vreg operands of a non-root node along our walk.
+    // The root nodes are the original candidates (stores normally).
+    // These are normally not the root nodes (except for the case of copies to
+    // physical registers).
+    for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
+      if (candidate->mayStore() || candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+        continue;
+
+      DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+      RegQueue.push(TypedVReg(MO.getReg()));
+    }
+
+    // Here we walk the root candidates. We start from the 0th operand because
+    // the root is normally a store to a vreg.
+    for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
+
+      if (!candidate->mayStore() && !candidate->isBranch())
+        break;
+
+      MachineOperand &MO = candidate->getOperand(i);
+
+      // TODO: Do we want to only add vregs here?
+      if (!MO.isReg() && !MO.isFI())
+        continue;
+
+      DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+
+      RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg()) :
+                                  TypedVReg(RSE_FrameIndex));
+    }
+
+    doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
+  }
+
+  // If we have populated no vregs to rename then bail.
+  // The rest of this function does the vreg remaping.
+  if (VRegs.size() == 0)
+    return Changed;
+
+  // Skip some vregs, so we can recon where we'll land next.
+  SkipVRegs(VRegGapIndex, MRI, DummyRC);
+
+  auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, DummyRC);
+  Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+  Changed |= doDefKillClear(MBB);
+
+  DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); dbgs() << "\n";);
+  DEBUG(dbgs() << "\n\n================================================\n\n");
+  return Changed;
+}
+
+bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
+
+  static unsigned functionNum = 0;
+  if (CanonicalizeFunctionNumber != ~0U) {
+    if (CanonicalizeFunctionNumber != functionNum++)
+      return false;
+    DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName() << "\n";);
+  }
+
+  // we need a valid vreg to create a vreg type for skipping all those
+  // stray vreg numbers so reach alignment/canonical vreg values.
+  std::vector<MachineBasicBlock*> RPOList = GetRPOList(MF);
+
+  DEBUG(
+    dbgs() << "\n\n  NEW MACHINE FUNCTION: " << MF.getName() << "  \n\n";
+    dbgs() << "\n\n================================================\n\n";
+    dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
+    for (auto MBB : RPOList) {
+      dbgs() << MBB->getName() << "\n";
+    }
+    dbgs() << "\n\n================================================\n\n";
+  );
+
+  std::vector<StringRef> BBNames;
+  std::vector<unsigned> RenamedInOtherBB;
+
+  unsigned GapIdx = 0;
+  unsigned BBNum = 0;
+
+  bool Changed = false;
+
+  for (auto MBB : RPOList)
+    Changed |= runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx);
+
+  return Changed;
+}
+
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index 9c8743a7164..fbba60c4312 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MILexer.h"
 #include "MIParser.h"
+#include "MILexer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -21,8 +21,8 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/AsmParser/SlotMapping.h"
@@ -36,6 +36,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -64,7 +65,6 @@
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index aae48587c5b..a817c62c985 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -12,16 +12,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -31,19 +33,18 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRPrinter.h"
-#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/IR/Value.h"
@@ -57,9 +58,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/YAMLTraits.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -162,8 +162,8 @@ public:
   void printStackObjectReference(int FrameIndex);
   void printOffset(int64_t Offset);
   void printTargetFlags(const MachineOperand &Op);
-  void print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
-             unsigned I, bool ShouldPrintRegisterTies,
+  void print(const MachineInstr &MI, unsigned OpIdx,
+             const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,
              LLT TypeToPrint, bool IsDef = false);
   void print(const LLVMContext &Context, const TargetInstrInfo &TII,
              const MachineMemOperand &Op);
@@ -734,7 +734,7 @@ void MIPrinter::print(const MachineInstr &MI) {
        ++I) {
     if (I)
       OS << ", ";
-    print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies,
+    print(MI, I, TRI, ShouldPrintRegisterTies,
           getTypeToPrint(MI, I, PrintedTypes, MRI),
           /*IsDef=*/true);
   }
@@ -751,7 +751,7 @@ void MIPrinter::print(const MachineInstr &MI) {
   for (; I < E; ++I) {
     if (NeedComma)
       OS << ", ";
-    print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies,
+    print(MI, I, TRI, ShouldPrintRegisterTies,
           getTypeToPrint(MI, I, PrintedTypes, MRI));
     NeedComma = true;
   }
@@ -923,9 +923,11 @@ static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
   return nullptr;
 }
 
-void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
-                      unsigned I, bool ShouldPrintRegisterTies, LLT TypeToPrint,
+void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
+                      const TargetRegisterInfo *TRI,
+                      bool ShouldPrintRegisterTies, LLT TypeToPrint,
                       bool IsDef) {
+  const MachineOperand &Op = MI.getOperand(OpIdx);
   printTargetFlags(Op);
   switch (Op.getType()) {
   case MachineOperand::MO_Register: {
@@ -959,13 +961,16 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
       }
     }
     if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef())
-      OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")";
+      OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(OpIdx) << ")";
     if (TypeToPrint.isValid())
       OS << '(' << TypeToPrint << ')';
     break;
   }
   case MachineOperand::MO_Immediate:
-    OS << Op.getImm();
+    if (MI.isOperandSubregIdx(OpIdx))
+      OS << "%subreg." << TRI->getSubRegIndexName(Op.getImm());
+    else
+      OS << Op.getImm();
     break;
   case MachineOperand::MO_CImmediate:
     Op.getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index d5758da0464..40ffbc46556 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -30,7 +31,6 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -42,6 +42,8 @@ using namespace llvm;
 MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
     : BB(B), Number(-1), xParent(&MF) {
   Insts.Parent = this;
+  if (B)
+    IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight();
 }
 
 MachineBasicBlock::~MachineBasicBlock() {
@@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
     }
     OS << '\n';
   }
+  if (IrrLoopHeaderWeight) {
+    if (Indexes) OS << '\t';
+    OS << "    Irreducible loop header weight: "
+       << IrrLoopHeaderWeight.getValue();
+    OS << '\n';
+  }
 }
 
 void MachineBasicBlock::printAsOperand(raw_ostream &OS,
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 14cd91206d8..2c336e45056 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
   return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None;
 }
 
+bool
+MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) {
+  assert(MBFI && "Expected analysis to be available");
+  return MBFI->isIrrLoopHeader(MBB);
+}
+
 const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
   return MBFI ? MBFI->getFunction() : nullptr;
 }
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index c5991332f08..9fa4c0d5e4f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -43,6 +43,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
@@ -55,7 +56,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index f0f63715d2f..be197a48d80 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
@@ -35,7 +36,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 9fc990f5c24..5b576b68fcc 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -21,11 +21,11 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 61f56fffc88..1a39afe655e 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -23,11 +23,11 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp
index be8adf75fb7..75c05e2f002 100644
--- a/lib/CodeGen/MachineFrameInfo.cpp
+++ b/lib/CodeGen/MachineFrameInfo.cpp
@@ -16,10 +16,10 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 250a10c7d07..570c410e1fe 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -58,7 +58,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index bb2dda980e4..2c81218f8f6 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
@@ -58,7 +59,6 @@
 #include "llvm/Support/LowLevelTypeImpl.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -320,8 +320,45 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
   }
   case MachineOperand::MO_MCSymbol:
     return getMCSymbol() == Other.getMCSymbol();
-  case MachineOperand::MO_CFIIndex:
-    return getCFIIndex() == Other.getCFIIndex();
+  case MachineOperand::MO_CFIIndex: {
+    const MachineFunction *MF = getParent()->getParent()->getParent();
+    const MachineFunction *OtherMF =
+        Other.getParent()->getParent()->getParent();
+    MCCFIInstruction Inst = MF->getFrameInstructions()[getCFIIndex()];
+    MCCFIInstruction OtherInst =
+        OtherMF->getFrameInstructions()[Other.getCFIIndex()];
+    MCCFIInstruction::OpType op = Inst.getOperation();
+    if (op != OtherInst.getOperation()) return false;
+    switch (op) {
+    case MCCFIInstruction::OpDefCfa:
+    case MCCFIInstruction::OpOffset:
+    case MCCFIInstruction::OpRelOffset:
+      if (Inst.getRegister() != OtherInst.getRegister()) return false;
+      if (Inst.getOffset() != OtherInst.getOffset()) return false;
+      break;
+    case MCCFIInstruction::OpRestore:
+    case MCCFIInstruction::OpUndefined:
+    case MCCFIInstruction::OpSameValue:
+    case MCCFIInstruction::OpDefCfaRegister:
+      if (Inst.getRegister() != OtherInst.getRegister()) return false;
+      break;
+    case MCCFIInstruction::OpRegister:
+      if (Inst.getRegister() != OtherInst.getRegister()) return false;
+      if (Inst.getRegister2() != OtherInst.getRegister2()) return false;
+      break;
+    case MCCFIInstruction::OpDefCfaOffset:
+    case MCCFIInstruction::OpAdjustCfaOffset:
+    case MCCFIInstruction::OpGnuArgsSize:
+      if (Inst.getOffset() != OtherInst.getOffset()) return false;
+      break;
+    case MCCFIInstruction::OpRememberState:
+    case MCCFIInstruction::OpRestoreState:
+    case MCCFIInstruction::OpEscape:
+    case MCCFIInstruction::OpWindowSave:
+      break;
+    }
+    return true;
+  }
   case MachineOperand::MO_Metadata:
     return getMetadata() == Other.getMetadata();
   case MachineOperand::MO_IntrinsicID:
@@ -370,8 +407,13 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
   case MachineOperand::MO_MCSymbol:
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
-  case MachineOperand::MO_CFIIndex:
-    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex());
+  case MachineOperand::MO_CFIIndex: {
+    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+    MCCFIInstruction Inst = MF->getFrameInstructions()[MO.getCFIIndex()];
+    return hash_combine(MO.getType(), MO.getTargetFlags(), Inst.getOperation(),
+                        Inst.getRegister(), Inst.getRegister2(),
+                        Inst.getOffset());
+  }
   case MachineOperand::MO_IntrinsicID:
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
   case MachineOperand::MO_Predicate:
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index b5621a09c6f..eceae9a968b 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -13,7 +13,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index efb5c3371de..d1147fea08e 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -43,7 +44,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 1bc869e02e6..48b68e3b718 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -65,11 +65,11 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index c852c2e1564..d270b8e5d8f 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -89,6 +89,7 @@
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
@@ -102,7 +103,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index be06053f004..1674aba0c82 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
@@ -28,7 +29,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index 65d82366767..a3e93de67bb 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -21,11 +21,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 3e12bdcd689..900a0a63e96 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -42,6 +42,7 @@
 #include "llvm/CodeGen/ScheduleDFS.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/MC/LaneBitmask.h"
@@ -52,7 +53,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index dd6e26d8f8b..f52e3942664 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Pass.h"
@@ -40,7 +41,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 0bd5c56871c..f894f470445 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -51,6 +51,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InlineAsm.h"
@@ -66,7 +67,6 @@
 #include "llvm/Support/LowLevelTypeImpl.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp
index 633a853b2c7..13ddad59382 100644
--- a/lib/CodeGen/MacroFusion.cpp
+++ b/lib/CodeGen/MacroFusion.cpp
@@ -19,10 +19,10 @@
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 #define DEBUG_TYPE "machine-scheduler"
 
diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index 6430e54a59c..530040a3332 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp
@@ -20,8 +20,8 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c7f0329b3c5..af26c170cb8 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -31,11 +31,11 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp
index 513e8271656..84dbebfd2b6 100644
--- a/lib/CodeGen/PatchableFunction.cpp
+++ b/lib/CodeGen/PatchableFunction.cpp
@@ -16,8 +16,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 7cff85a3ab0..c267018c4d1 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -81,6 +81,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
@@ -88,7 +89,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/PostRAHazardRecognizer.cpp b/lib/CodeGen/PostRAHazardRecognizer.cpp
index 4a50d895340..9770b336da6 100644
--- a/lib/CodeGen/PostRAHazardRecognizer.cpp
+++ b/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -31,10 +31,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index f2249f9e37e..fd92609613b 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -34,12 +34,12 @@
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 0118580a626..feab36d3995 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -13,9 +13,9 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index d9e9b3360a0..e41ffc244d9 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -39,6 +39,8 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
@@ -55,8 +57,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetOptions.h"
@@ -76,12 +76,6 @@ using namespace llvm;
 
 using MBBVector = SmallVector<MachineBasicBlock *, 4>;
 
-static void spillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
-                                 unsigned &MinCSFrameIndex,
-                                 unsigned &MaxCXFrameIndex,
-                                 const MBBVector &SaveBlocks,
-                                 const MBBVector &RestoreBlocks);
-
 namespace {
 
 class PEI : public MachineFunctionPass {
@@ -125,6 +119,7 @@ private:
 
   void calculateCallFrameInfo(MachineFunction &Fn);
   void calculateSaveRestoreBlocks(MachineFunction &Fn);
+  void spillCalleeSavedRegs(MachineFunction &MF);
 
   void calculateFrameObjectOffsets(MachineFunction &Fn);
   void replaceFrameIndices(MachineFunction &Fn);
@@ -197,8 +192,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
 
   // Handle CSR spilling and restoring, for targets that need it.
   if (Fn.getTarget().usesPhysRegsForPEI())
-    spillCalleeSavedRegs(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex, SaveBlocks,
-                         RestoreBlocks);
+    spillCalleeSavedRegs(Fn);
 
   // Allow the target machine to make final modifications to the function
   // before the frame layout is finalized.
@@ -505,11 +499,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
   }
 }
 
-static void spillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS,
-                                 unsigned &MinCSFrameIndex,
-                                 unsigned &MaxCSFrameIndex,
-                                 const MBBVector &SaveBlocks,
-                                 const MBBVector &RestoreBlocks) {
+void PEI::spillCalleeSavedRegs(MachineFunction &Fn) {
   // We can't list this requirement in getRequiredProperties because some
   // targets (WebAssembly) use virtual registers past this point, and the pass
   // pipeline is set up without giving the passes a chance to look at the
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 5fa5587457d..86fd8745052 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -14,7 +14,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 7061c3ff652..19467ae3b72 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Metadata.h"
@@ -41,7 +42,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index e74ac79f001..23e5f907c88 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -54,6 +54,7 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@@ -66,7 +67,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index 214c6d2c820..3aaa5a4738d 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -27,7 +27,7 @@
 #include "llvm/CodeGen/RegisterUsageInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 956dec39fc3..8e463ff272d 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -24,7 +24,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 1ef7e41b8ae..84c2e2548ec 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -35,17 +35,17 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
-#include "llvm/Pass.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 844ddb9ed3f..18fe2d85954 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -28,13 +28,13 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp
index bd5ecbd28f2..19e0f30ecfc 100644
--- a/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -35,7 +35,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 5e95f760aaa..627bc1946f3 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -19,11 +19,11 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index e2cb8cad6e1..b789e2d9c52 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -15,12 +15,12 @@
 
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <cassert>
 
 using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 491c56a7314..6e245feb735 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -63,6 +63,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Attributes.h"
@@ -98,7 +99,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index b736037d71d..696855f8018 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -19,6 +19,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -32,8 +34,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7a123e3e92e..2760c6cbf86 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -21,12 +21,12 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ff49134f7b9..356f2585046 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -40,7 +40,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b42edf8e751..0d85bccdeac 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -849,13 +849,14 @@ static void transferDbgValues(SelectionDAG &DAG, SDValue From, SDValue To,
       break;
 
     DIVariable *Var = Dbg->getVariable();
-    auto *Fragment = DIExpression::createFragmentExpression(
-        Dbg->getExpression(), OffsetInBits, To.getValueSizeInBits());
-    SDDbgValue *Clone =
-        DAG.getDbgValue(Var, Fragment, ToNode, To.getResNo(), Dbg->isIndirect(),
-                        Dbg->getDebugLoc(), Dbg->getOrder());
+    if (auto Fragment = DIExpression::createFragmentExpression(
+            Dbg->getExpression(), OffsetInBits, To.getValueSizeInBits())) {
+      SDDbgValue *Clone = DAG.getDbgValue(Var, *Fragment, ToNode, To.getResNo(),
+                                          Dbg->isIndirect(), Dbg->getDebugLoc(),
+                                          Dbg->getOrder());
+      ClonedDVs.push_back(Clone);
+    }
     Dbg->setIsInvalidated();
-    ClonedDVs.push_back(Clone);
   }
 
   for (SDDbgValue *Dbg : ClonedDVs)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5d6c4998ecd..b55414b51b8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3844,7 +3844,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     }
 
     LdOps.push_back(L);
-
+    LdOp = L;
 
     LdWidth -= NewVTWidth;
   }
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 13799409327..87dcf6e6950 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -18,12 +18,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 98202925629..24d9d376de2 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -32,6 +32,7 @@
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -42,7 +43,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 7ddb0dc07fd..2e1abbe8bb2 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -23,11 +23,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index 631cb34717c..c25315f2983 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -25,11 +25,11 @@
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e5de280508b..2fb2615b072 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2893,11 +2893,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
   }
   case ISD::FrameIndex:
   case ISD::TargetFrameIndex:
-    if (unsigned Align = InferPtrAlignment(Op)) {
-      // The low bits are known zero if the pointer is aligned.
-      Known.Zero.setLowBits(Log2_32(Align));
-      break;
-    }
+    TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
     break;
 
   default:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ccc06fa3ee1..f4f8879b5d8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -55,6 +55,8 @@
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/Argument.h"
@@ -98,8 +100,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -2585,7 +2585,7 @@ static bool isVectorReductionOp(const User *I) {
   case Instruction::FAdd:
   case Instruction::FMul:
     if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
-      if (FPOp->getFastMathFlags().unsafeAlgebra())
+      if (FPOp->getFastMathFlags().isFast())
         break;
     LLVM_FALLTHROUGH;
   default:
@@ -2631,7 +2631,7 @@ static bool isVectorReductionOp(const User *I) {
 
       if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
         if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
-          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
             return false;
         UsersToVisit.push_back(U);
       } else if (const ShuffleVectorInst *ShufInst =
@@ -2725,7 +2725,7 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
   Flags.setNoInfs(FMF.noInfs());
   Flags.setNoNaNs(FMF.noNaNs());
   Flags.setNoSignedZeros(FMF.noSignedZeros());
-  Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+  Flags.setUnsafeAlgebra(FMF.isFast());
 
   SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
                                      Op1, Op2, Flags);
@@ -3862,7 +3862,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
 //
 // When the first GEP operand is a single pointer - it is the uniform base we
 // are looking for. If first operand of the GEP is a splat vector - we
-// extract the spalt value and use it as a uniform base.
+// extract the splat value and use it as a uniform base.
 // In all other cases the function returns 'false'.
 static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
                            SelectionDAGBuilder* SDB) {
@@ -4828,12 +4828,6 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
 
-  // Ignore inlined function arguments here.
-  //
-  // FIXME: Should we be checking DL->inlinedAt() to determine this?
-  if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction()))
-    return false;
-
   bool IsIndirect = false;
   Optional<MachineOperand> Op;
   // Some arguments' frame index is recorded during argument lowering.
@@ -4873,11 +4867,13 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
           for (unsigned E = I + RegCount; I != E; ++I) {
             // The vregs are guaranteed to be allocated in sequence.
             Op = MachineOperand::CreateReg(VMI->second + I, false);
-            auto *FragmentExpr = DIExpression::createFragmentExpression(
+            auto FragmentExpr = DIExpression::createFragmentExpression(
                 Expr, Offset, RegisterSize);
+            if (!FragmentExpr)
+              continue;
             FuncInfo.ArgDbgValues.push_back(
                 BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
-                        Op->getReg(), Variable, FragmentExpr));
+                        Op->getReg(), Variable, *FragmentExpr));
             Offset += RegisterSize;
           }
         }
@@ -7959,13 +7955,13 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
 
   switch (Intrinsic) {
   case Intrinsic::experimental_vector_reduce_fadd:
-    if (FMF.unsafeAlgebra())
+    if (FMF.isFast())
       Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
     break;
   case Intrinsic::experimental_vector_reduce_fmul:
-    if (FMF.unsafeAlgebra())
+    if (FMF.isFast())
       Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
     else
       Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 1550347f006..1097fd92ede 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -38,7 +39,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Printable.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4c4d196427e..ae5eebd9545 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "ScheduleDAGSDNodes.h"
 #include "SelectionDAGBuilder.h"
 #include "llvm/ADT/APInt.h"
@@ -45,9 +46,9 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -80,7 +81,6 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index fe553bc986a..1f6fafb039e 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1288,6 +1288,19 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   Known.resetAll();
 }
 
+void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+                                                   KnownBits &Known,
+                                                   const APInt &DemandedElts,
+                                                   const SelectionDAG &DAG,
+                                                   unsigned Depth) const {
+  assert(isa<FrameIndexSDNode>(Op) && "expected FrameIndex");
+
+  if (unsigned Align = DAG.InferPtrAlignment(Op)) {
+    // The low bits are known zero if the pointer is aligned.
+    Known.Zero.setLowBits(Log2_32(Align));
+  }
+}
+
 /// This method can be implemented by targets that want to expose additional
 /// information about sign bits to the DAG Combiner.
 unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 5fb6afee88a..3230aff5ed8 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -56,15 +56,17 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -73,8 +75,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index 3656832a7f1..25a1c37b145 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -10,9 +10,9 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 1467179b7a3..7a9a65faaca 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/LaneBitmask.h"
@@ -43,7 +44,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 8a47f3d2d6d..69614a55e14 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -28,12 +28,12 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
index bd3a20f936d..5b76e36382c 100644
--- a/lib/CodeGen/TailDuplicator.cpp
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -12,13 +12,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -27,14 +28,13 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
-#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
@@ -603,8 +603,8 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
     if (PreRegAlloc && MI.isCall())
       return false;
 
-    if (!MI.isPHI() && !MI.isDebugValue())
-      InstrCount += 1;
+    if (!MI.isPHI() && !MI.isMetaInstruction())
+        InstrCount += 1;
 
     if (InstrCount > MaxDuplicateCount)
       return false;
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 9dd98b4020d..4d2130f5ebe 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -20,7 +20,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -104,3 +104,12 @@ unsigned TargetFrameLowering::getStackAlignmentSkew(
 
   return 0;
 }
+
+int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+  llvm_unreachable("getInitialCFAOffset() not implemented!");
+}
+
+unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF)
+    const {
+  llvm_unreachable("getInitialCFARegister() not implemented!");
+}
+\ No newline at end of file
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index bac12efd639..702b91b7f77 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -26,7 +27,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index ed845e1706f..99ff4931e2f 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -15,7 +15,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index c5101b1ecfc..59e88ba3bda 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -600,8 +600,14 @@ void TargetPassConfig::addIRPasses() {
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
-    addPass(createMergeICmpsPass());
+  if (getOptLevel() != CodeGenOpt::None) {
+    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+    // loads and compares. ExpandMemCmpPass then tries to expand those calls
+    // into optimally-sized loads and compares. The transforms are enabled by a
+    // target lowering hook.
+    if (EnableMergeICmps)
+      addPass(createMergeICmpsPass());
+    addPass(createExpandMemCmpPass());
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 55318237e95..758fdabf5dd 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -27,7 +27,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Printable.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index e1db9157f90..659d6f522af 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -16,13 +16,13 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/CodeGen/TargetSubtargetInfo.cpp b/lib/CodeGen/TargetSubtargetInfo.cpp
index 29cfd9fb178..d66272aed9b 100644
--- a/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -14,11 +14,11 @@
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <string>
 
 using namespace llvm;
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index efd40b209e9..18f1485baca 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -46,6 +46,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Pass.h"
@@ -54,7 +55,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index bdd25f29aea..5288ca67277 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Dominators.h"
@@ -37,7 +38,6 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 static bool eliminateUnreachableBlock(Function &F) {
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 65c62b16719..d499c6c1e73 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -31,12 +31,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
index 2063ab11a74..fb621cab28b 100644
--- a/lib/CodeGen/XRayInstrumentation.cpp
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -23,10 +23,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index a88dcfcf542..f593953c62f 100644
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -184,7 +184,7 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
         FormValue.setSValue(Spec.getImplicitConstValue());
         return FormValue;
       }
-      if (FormValue.extractValue(DebugInfoData, &Offset, &U))
+      if (FormValue.extractValue(DebugInfoData, &Offset, U.getFormParams(), &U))
         return FormValue;
     }
     // March Offset along until we get to the attribute we want.
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index dbe6fe52407..f04ec7706cd 100644
--- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -90,10 +90,11 @@ std::pair<uint32_t, dwarf::Tag>
 DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
   uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
   dwarf::Tag DieTag = dwarf::DW_TAG_null;
+  DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
 
   for (auto Atom : getAtomsDesc()) {
     DWARFFormValue FormValue(Atom.second);
-    FormValue.extractValue(AccelSection, &HashDataOffset, NULL);
+    FormValue.extractValue(AccelSection, &HashDataOffset, FormParams);
     switch (Atom.first) {
     case dwarf::DW_ATOM_die_offset:
       DieOffset = *FormValue.getAsUnsignedConstant();
@@ -145,6 +146,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
   uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
   unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
   unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
+  DWARFFormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
 
   for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
     unsigned Index = AccelSection.getU32(&Offset);
@@ -181,7 +183,7 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
           unsigned i = 0;
           for (auto &Atom : AtomForms) {
             OS << format("{Atom[%d]: ", i++);
-            if (Atom.extractValue(AccelSection, &DataOffset, nullptr))
+            if (Atom.extractValue(AccelSection, &DataOffset, FormParams))
               Atom.dump(OS);
             else
               OS << "Error extracting the value";
@@ -216,8 +218,10 @@ void DWARFAcceleratorTable::ValueIterator::Next() {
     NumData = 0;
     return;
   }
+  DWARFFormParams FormParams = {AccelTable->Hdr.Version, 0,
+                                dwarf::DwarfFormat::DWARF32};
   for (auto &Atom : AtomForms)
-    Atom.extractValue(AccelSection, &DataOffset, nullptr);
+    Atom.extractValue(AccelSection, &DataOffset, FormParams);
   ++Data;
 }
 
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 24aa666fb81..881cd1dfd11 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -329,9 +329,9 @@ void DWARFContext::dump(
         // representation.
         OS << "debug_line[" << format("0x%8.8x", Offset) << "]\n";
         if (DumpOpts.Verbose) {
-          LineTable.parse(lineData, &Offset, &OS);
+          LineTable.parse(lineData, &Offset, &*CU, &OS);
         } else {
-          LineTable.parse(lineData, &Offset);
+          LineTable.parse(lineData, &Offset, &*CU);
           LineTable.dump(OS);
         }
       }
@@ -349,7 +349,7 @@ void DWARFContext::dump(
     DWARFDataExtractor lineData(*DObj, DObj->getLineDWOSection(),
                                 isLittleEndian(), savedAddressByteSize);
     DWARFDebugLine::LineTable LineTable;
-    while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
+    while (LineTable.Prologue.parse(lineData, &stmtOffset, nullptr)) {
       LineTable.dump(OS);
       LineTable.clear();
     }
@@ -681,7 +681,7 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) {
   // We have to parse it first.
   DWARFDataExtractor lineData(*DObj, U->getLineSection(), isLittleEndian(),
                               U->getAddressByteSize());
-  return Line->getOrParseLineTable(lineData, stmtOffset);
+  return Line->getOrParseLineTable(lineData, stmtOffset, U);
 }
 
 void DWARFContext::parseCompileUnits() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index bd8dd0d0ede..c99c7a9277e 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -144,7 +144,7 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
 static bool
 parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
                      uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
-                     const DWARFFormParams &FormParams,
+                     const DWARFFormParams &FormParams, const DWARFUnit *U,
                      std::vector<StringRef> &IncludeDirectories,
                      std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
   // Get the directory entry description.
@@ -162,7 +162,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
       DWARFFormValue Value(Descriptor.Form);
       switch (Descriptor.Type) {
       case DW_LNCT_path:
-        if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+        if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
           return false;
         IncludeDirectories.push_back(Value.getAsCString().getValue());
         break;
@@ -187,7 +187,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
     DWARFDebugLine::FileNameEntry FileEntry;
     for (auto Descriptor : FileDescriptors) {
       DWARFFormValue Value(Descriptor.Form);
-      if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+      if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, U))
         return false;
       switch (Descriptor.Type) {
       case DW_LNCT_path:
@@ -213,7 +213,7 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
 }
 
 bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
-                                     uint32_t *OffsetPtr) {
+                                     uint32_t *OffsetPtr, const DWARFUnit *U) {
   const uint64_t PrologueOffset = *OffsetPtr;
 
   clear();
@@ -253,7 +253,8 @@ bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
 
   if (getVersion() >= 5) {
     if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
-                              getFormParams(), IncludeDirectories, FileNames)) {
+                              getFormParams(), U, IncludeDirectories,
+                              FileNames)) {
       fprintf(stderr,
               "warning: parsing line table prologue at 0x%8.8" PRIx64
               " found an invalid directory or file table description at"
@@ -382,24 +383,25 @@ DWARFDebugLine::getLineTable(uint32_t Offset) const {
 
 const DWARFDebugLine::LineTable *
 DWARFDebugLine::getOrParseLineTable(const DWARFDataExtractor &DebugLineData,
-                                    uint32_t Offset) {
+                                    uint32_t Offset, const DWARFUnit *U) {
   std::pair<LineTableIter, bool> Pos =
       LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable()));
   LineTable *LT = &Pos.first->second;
   if (Pos.second) {
-    if (!LT->parse(DebugLineData, &Offset))
+    if (!LT->parse(DebugLineData, &Offset, U))
       return nullptr;
   }
   return LT;
 }
 
 bool DWARFDebugLine::LineTable::parse(const DWARFDataExtractor &DebugLineData,
-                                      uint32_t *OffsetPtr, raw_ostream *OS) {
+                                      uint32_t *OffsetPtr, const DWARFUnit *U,
+                                      raw_ostream *OS) {
   const uint32_t DebugLineOffset = *OffsetPtr;
 
   clear();
 
-  if (!Prologue.parse(DebugLineData, OffsetPtr)) {
+  if (!Prologue.parse(DebugLineData, OffsetPtr, U)) {
     // Restore our offset and return false to indicate failure!
     *OffsetPtr = DebugLineOffset;
     return false;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index d20eabff7f0..a579c06d02e 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -208,7 +208,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
   DWARFUnit *U = Die.getDwarfUnit();
   DWARFFormValue formValue(Form);
 
-  if (!formValue.extractValue(U->getDebugInfoExtractor(), OffsetPtr, U))
+  if (!formValue.extractValue(U->getDebugInfoExtractor(), OffsetPtr,
+                              U->getFormParams(), U))
     return;
 
   OS << "\t(";
@@ -550,7 +551,7 @@ void DWARFDie::attribute_iterator::updateForIndex(
     auto U = Die.getDwarfUnit();
     assert(U && "Die must have valid DWARF unit");
     bool b = AttrValue.Value.extractValue(U->getDebugInfoExtractor(),
-                                          &ParseOffset, U);
+                                          &ParseOffset, U->getFormParams(), U);
     (void)b;
     assert(b && "extractValue cannot fail on fully parsed DWARF");
     AttrValue.ByteSize = ParseOffset - AttrValue.Offset;
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index d63e84ef4e7..c4abd49797b 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -276,7 +276,8 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
 }
 
 bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
-                                  uint32_t *OffsetPtr, const DWARFUnit *CU) {
+                                  uint32_t *OffsetPtr, DWARFFormParams FP,
+                                  const DWARFUnit *CU) {
   U = CU;
   bool Indirect = false;
   bool IsBlock = false;
@@ -288,10 +289,8 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
     switch (Form) {
     case DW_FORM_addr:
     case DW_FORM_ref_addr: {
-      if (!U)
-        return false;
-      uint16_t Size = (Form == DW_FORM_addr) ? U->getAddressByteSize()
-                                             : U->getRefAddrByteSize();
+      uint16_t Size =
+          (Form == DW_FORM_addr) ? FP.AddrSize : FP.getRefAddrByteSize();
       Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex);
       break;
     }
@@ -360,10 +359,8 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
     case DW_FORM_GNU_strp_alt:
     case DW_FORM_line_strp:
     case DW_FORM_strp_sup: {
-      if (!U)
-        return false;
       Value.uval =
-          Data.getRelocatedValue(U->getDwarfOffsetByteSize(), OffsetPtr);
+          Data.getRelocatedValue(FP.getDwarfOffsetByteSize(), OffsetPtr);
       break;
     }
     case DW_FORM_flag_present:
diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index dd8e2ac8b53..dee27c621fa 100644
--- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -176,9 +176,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
 
   uint64_t Filesize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
   auto OutFileOrError = FileOutputBuffer::create(Filename, Filesize);
-  if (OutFileOrError.getError())
-    return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
-                                               Filename);
+  if (auto E = OutFileOrError.takeError())
+    return E;
   FileBufferByteStream Buffer(std::move(*OutFileOrError),
                               llvm::support::little);
   BinaryStreamWriter Writer(Buffer);
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 3f41a1dc066..0fafe82404e 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -1108,10 +1108,12 @@ static void writeAtomicRMWOperation(raw_ostream &Out,
 
 static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
   if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) {
-    // Unsafe algebra implies all the others, no need to write them all out
-    if (FPO->hasUnsafeAlgebra())
+    // 'Fast' is an abbreviation for all fast-math-flags.
+    if (FPO->isFast())
       Out << " fast";
     else {
+      if (FPO->hasAllowReassoc())
+        Out << " reassoc";
       if (FPO->hasNoNaNs())
         Out << " nnan";
       if (FPO->hasNoInfs())
@@ -1122,6 +1124,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
         Out << " arcp";
       if (FPO->hasAllowContract())
         Out << " contract";
+      if (FPO->hasApproxFunc())
+        Out << " afn";
     }
   }
 
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 07d499bc193..2c9e9be3da5 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -78,6 +78,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name=="ssse3.pabs.d.128" || // Added in 6.0
       Name.startswith("avx2.pabs.") || // Added in 6.0
       Name.startswith("avx512.mask.pabs.") || // Added in 6.0
+      Name.startswith("avx512.broadcastm") || // Added in 6.0
       Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
       Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -1027,7 +1028,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
                                CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = Builder.CreateSExt(Rep, CI->getType(), "");
-    } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
+    } else if (IsX86 && (Name.startswith("avx512.broadcastm"))) {
+      Type *ExtTy = Type::getInt32Ty(C);
+      if (CI->getOperand(0)->getType()->isIntegerTy(8))
+        ExtTy = Type::getInt64Ty(C);
+      unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() /
+                         ExtTy->getPrimitiveSizeInBits();
+      Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
+      Rep = Builder.CreateVectorSplat(NumElts, Rep);
+    } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))) {
       unsigned NumElts =
           CI->getArgOperand(1)->getType()->getVectorNumElements();
       Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 2b780adf6c6..22513924a96 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const {
 const LandingPadInst *BasicBlock::getLandingPadInst() const {
   return dyn_cast<LandingPadInst>(getFirstNonPHI());
 }
+
+Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
+  const TerminatorInst *TI = getTerminator();
+  if (MDNode *MDIrrLoopHeader =
+      TI->getMetadata(LLVMContext::MD_irr_loop)) {
+    MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
+    if (MDName->getString().equals("loop_header_weight")) {
+      auto *CI = mdconst::extract<ConstantInt>(MDIrrLoopHeader->getOperand(1));
+      return Optional<uint64_t>(CI->getValue().getZExtValue());
+    }
+  }
+  return Optional<uint64_t>();
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index eb4b9143090..17822bbbb5c 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -22,7 +22,6 @@ add_llvm_library(LLVMCore
   DiagnosticPrinter.cpp
   Dominators.cpp
   Function.cpp
-  GCOV.cpp
   GVMaterializer.cpp
   Globals.cpp
   IRBuilder.cpp
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index b34383f5ada..df0c52d4463 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -676,25 +676,7 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
 
 void Instruction::applyMergedLocation(const DILocation *LocA,
                                       const DILocation *LocB) {
-  if (LocA && LocB && (LocA == LocB || !LocA->canDiscriminate(*LocB))) {
-    setDebugLoc(LocA);
-    return;
-  }
-  if (!LocA || !LocB || !isa<CallInst>(this)) {
-    setDebugLoc(nullptr);
-    return;
-  }
-  SmallPtrSet<DILocation *, 5> InlinedLocationsA;
-  for (DILocation *L = LocA->getInlinedAt(); L; L = L->getInlinedAt())
-    InlinedLocationsA.insert(L);
-  const DILocation *Result = LocB;
-  for (DILocation *L = LocB->getInlinedAt(); L; L = L->getInlinedAt()) {
-    Result = L;
-    if (InlinedLocationsA.count(L))
-      break;
-  }
-  setDebugLoc(DILocation::get(
-      Result->getContext(), 0, 0, Result->getScope(), Result->getInlinedAt()));
+  setDebugLoc(DILocation::getMergedLocation(LocA, LocB, this));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 645bba1652d..ae02392ea14 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -14,9 +14,11 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "LLVMContextImpl.h"
 #include "MetadataImpl.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
 
 using namespace llvm;
 
@@ -66,6 +68,31 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
                    Storage, Context.pImpl->DILocations);
 }
 
+const DILocation *
+DILocation::getMergedLocation(const DILocation *LocA, const DILocation *LocB,
+                              const Instruction *ForInst) {
+  if (!LocA || !LocB)
+    return nullptr;
+
+  if (LocA == LocB || !LocA->canDiscriminate(*LocB))
+    return LocA;
+
+  if (!dyn_cast_or_null<CallInst>(ForInst))
+    return nullptr;
+
+  SmallPtrSet<DILocation *, 5> InlinedLocationsA;
+  for (DILocation *L = LocA->getInlinedAt(); L; L = L->getInlinedAt())
+    InlinedLocationsA.insert(L);
+  const DILocation *Result = LocB;
+  for (DILocation *L = LocB->getInlinedAt(); L; L = L->getInlinedAt()) {
+    Result = L;
+    if (InlinedLocationsA.count(L))
+      break;
+  }
+  return DILocation::get(Result->getContext(), 0, 0, Result->getScope(),
+                         Result->getInlinedAt());
+}
+
 DINode::DIFlags DINode::getFlag(StringRef Flag) {
   return StringSwitch<DIFlags>(Flag)
 #define HANDLE_DI_FLAG(ID, NAME) .Case("DIFlag" #NAME, Flag##NAME)
@@ -726,14 +753,23 @@ DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
   return DIExpression::get(Expr->getContext(), Ops);
 }
 
-DIExpression *DIExpression::createFragmentExpression(const DIExpression *Expr,
-                                                     unsigned OffsetInBits,
-                                                     unsigned SizeInBits) {
+Optional<DIExpression *> DIExpression::createFragmentExpression(
+    const DIExpression *Expr, unsigned OffsetInBits, unsigned SizeInBits) {
   SmallVector<uint64_t, 8> Ops;
   // Copy over the expression, but leave off any trailing DW_OP_LLVM_fragment.
   if (Expr) {
     for (auto Op : Expr->expr_ops()) {
-      if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+      switch (Op.getOp()) {
+      default: break;
+      case dwarf::DW_OP_plus:
+      case dwarf::DW_OP_minus:
+        // We can't safely split arithmetic into multiple fragments because we
+        // can't express carry-over between fragments.
+        //
+        // FIXME: We *could* preserve the lowest fragment of a constant offset
+        // operation if the offset fits into SizeInBits.
+        return None;
+      case dwarf::DW_OP_LLVM_fragment: {
         // Make the new offset point into the existing fragment.
         uint64_t FragmentOffsetInBits = Op.getArg(0);
         // Op.getArg(0) is FragmentOffsetInBits.
@@ -741,7 +777,8 @@ DIExpression *DIExpression::createFragmentExpression(const DIExpression *Expr,
         assert((OffsetInBits + SizeInBits <= Op.getArg(0) + Op.getArg(1)) &&
                "new fragment outside of original fragment");
         OffsetInBits += FragmentOffsetInBits;
-        break;
+        continue;
+      }
       }
       Ops.push_back(Op.getOp());
       for (unsigned I = 0; I < Op.getNumArgs(); ++I)
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index ceb521c4c48..ffc3a30e6a1 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -146,9 +146,14 @@ bool Instruction::isExact() const {
   return cast<PossiblyExactOperator>(this)->isExact();
 }
 
-void Instruction::setHasUnsafeAlgebra(bool B) {
+void Instruction::setFast(bool B) {
   assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
-  cast<FPMathOperator>(this)->setHasUnsafeAlgebra(B);
+  cast<FPMathOperator>(this)->setFast(B);
+}
+
+void Instruction::setHasAllowReassoc(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasAllowReassoc(B);
 }
 
 void Instruction::setHasNoNaNs(bool B) {
@@ -171,6 +176,11 @@ void Instruction::setHasAllowReciprocal(bool B) {
   cast<FPMathOperator>(this)->setHasAllowReciprocal(B);
 }
 
+void Instruction::setHasApproxFunc(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasApproxFunc(B);
+}
+
 void Instruction::setFastMathFlags(FastMathFlags FMF) {
   assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
   cast<FPMathOperator>(this)->setFastMathFlags(FMF);
@@ -181,9 +191,14 @@ void Instruction::copyFastMathFlags(FastMathFlags FMF) {
   cast<FPMathOperator>(this)->copyFastMathFlags(FMF);
 }
 
-bool Instruction::hasUnsafeAlgebra() const {
+bool Instruction::isFast() const {
   assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
-  return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+  return cast<FPMathOperator>(this)->isFast();
+}
+
+bool Instruction::hasAllowReassoc() const {
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasAllowReassoc();
 }
 
 bool Instruction::hasNoNaNs() const {
@@ -211,6 +226,11 @@ bool Instruction::hasAllowContract() const {
   return cast<FPMathOperator>(this)->hasAllowContract();
 }
 
+bool Instruction::hasApproxFunc() const {
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasApproxFunc();
+}
+
 FastMathFlags Instruction::getFastMathFlags() const {
   assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -579,7 +599,7 @@ bool Instruction::isAssociative() const {
   switch (Opcode) {
   case FMul:
   case FAdd:
-    return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+    return cast<FPMathOperator>(this)->isFast();
   default:
     return false;
   }
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index a94da5452b8..c8b7c10a9a4 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
     {MD_absolute_symbol, "absolute_symbol"},
     {MD_associated, "associated"},
     {MD_callees, "callees"},
+    {MD_irr_loop, "irr_loop"},
   };
 
   for (auto &MDKind : MDKinds) {
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index 54783e884e9..d8e64db7c5d 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
   }
   return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)});
 }
+
+MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
+  SmallVector<Metadata *, 2> Vals(2);
+  Vals[0] = createString("loop_header_weight");
+  Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight));
+  return MDNode::get(Context, Vals);
+}
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 51a7d424c1f..5df0c6d81cf 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -454,6 +454,28 @@ void Value::replaceUsesOutsideBlock(Value *New, BasicBlock *BB) {
   }
 }
 
+void Value::replaceUsesExceptBlockAddr(Value *New) {
+  use_iterator UI = use_begin(), E = use_end();
+  for (; UI != E;) {
+    Use &U = *UI;
+    ++UI;
+
+    if (isa<BlockAddress>(U.getUser()))
+      continue;
+
+    // Must handle Constants specially, we cannot call replaceUsesOfWith on a
+    // constant because they are uniqued.
+    if (auto *C = dyn_cast<Constant>(U.getUser())) {
+      if (!isa<GlobalValue>(C)) {
+        C->handleOperandChange(this, New);
+        continue;
+      }
+    }
+
+    U.set(New);
+  }
+}
+
 namespace {
 // Various metrics for how much to strip off of pointers.
 enum PointerStripKind {
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index c528f7167e7..5bb1f84d2e5 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -115,8 +115,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> VerifyDebugInfo("verify-debug-info", cl::init(true));
-
 namespace llvm {
 
 struct VerifierSupport {
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 017dd201f9c..9c737795b5a 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -630,6 +630,9 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
           NonPrevailingComdats.insert(GV->getComdat());
         cast<GlobalObject>(GV)->setComdat(nullptr);
       }
+
+      // Set the 'local' flag based on the linker resolution for this symbol.
+      GV->setDSOLocal(Res.FinalDefinitionInLinkageUnit);
     }
     // Common resolution: collect the maximum size/alignment over all commons.
     // We also record if we see an instance of a common as prevailing, so that
@@ -643,7 +646,6 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
       CommonRes.Prevailing |= Res.Prevailing;
     }
 
-    // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
   }
   if (!M.getComdatSymbolTable().empty())
     for (GlobalValue &GV : M.global_values())
@@ -698,10 +700,10 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
     assert(ResI != ResE);
     SymbolResolution Res = *ResI++;
 
-    if (Res.Prevailing) {
-      if (!Sym.getIRName().empty()) {
-        auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
-            Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+    if (!Sym.getIRName().empty()) {
+      auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+          Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+      if (Res.Prevailing) {
         ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
@@ -713,6 +715,15 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                   GUID, BM.getModuleIdentifier()))
             S->setLinkage(GlobalValue::WeakAnyLinkage);
       }
+
+      // If the linker resolved the symbol to a local definition then mark it
+      // as local in the summary for the module we are adding.
+      if (Res.FinalDefinitionInLinkageUnit) {
+        if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
+                GUID, BM.getModuleIdentifier())) {
+          S->setDSOLocal(true);
+        }
+      }
     }
   }
 
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 9759c0c6c1d..87867c54fad 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -469,17 +469,15 @@ void LTOCodeGenerator::restoreLinkageForExternals() {
     if (I == ExternalSymbols.end())
       return;
 
-    GV.setLinkage(I->second);
-  };
-
-  std::for_each(MergedModule->begin(), MergedModule->end(), externalize);
-  std::for_each(MergedModule->global_begin(), MergedModule->global_end(),
-                externalize);
-  std::for_each(MergedModule->alias_begin(), MergedModule->alias_end(),
-                externalize);
-}
-
-void LTOCodeGenerator::verifyMergedModuleOnce() {
+    GV.setLinkage(I->second);
+  };
+
+  llvm::for_each(MergedModule->functions(), externalize);
+  llvm::for_each(MergedModule->globals(), externalize);
+  llvm::for_each(MergedModule->aliases(), externalize);
+}
+
+void LTOCodeGenerator::verifyMergedModuleOnce() {
   // Only run on the first call.
   if (HasVerifiedInput)
     return;
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 29b14414ea2..9a23e614f3a 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -290,7 +290,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
   case MCFragment::FT_Padding:
     return cast<MCPaddingFragment>(F).getSize();
 
-  case MCFragment::FT_SafeSEH:
+  case MCFragment::FT_SymbolId:
     return 4;
 
   case MCFragment::FT_Align: {
@@ -563,8 +563,8 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
     break;
   }
 
-  case MCFragment::FT_SafeSEH: {
-    const MCSafeSEHFragment &SF = cast<MCSafeSEHFragment>(F);
+  case MCFragment::FT_SymbolId: {
+    const MCSymbolIdFragment &SF = cast<MCSymbolIdFragment>(F);
     OW->write32(SF.getSymbol()->getIndex());
     break;
   }
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index 94839de14f8..1aed50aaeb7 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -281,8 +281,8 @@ void MCFragment::destroy() {
     case FT_Padding:
       delete cast<MCPaddingFragment>(this);
       return;
-    case FT_SafeSEH:
-      delete cast<MCSafeSEHFragment>(this);
+    case FT_SymbolId:
+      delete cast<MCSymbolIdFragment>(this);
       return;
     case FT_CVInlineLines:
       delete cast<MCCVInlineLineTableFragment>(this);
@@ -326,7 +326,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
   case MCFragment::FT_LEB:   OS << "MCLEBFragment"; break;
   case MCFragment::FT_Padding: OS << "MCPaddingFragment"; break;
-  case MCFragment::FT_SafeSEH:    OS << "MCSafeSEHFragment"; break;
+  case MCFragment::FT_SymbolId:    OS << "MCSymbolIdFragment"; break;
   case MCFragment::FT_CVInlineLines: OS << "MCCVInlineLineTableFragment"; break;
   case MCFragment::FT_CVDefRange: OS << "MCCVDefRangeTableFragment"; break;
   case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
@@ -436,8 +436,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
     OS << "\n       ";
     break;
   }
-  case MCFragment::FT_SafeSEH: {
-    const MCSafeSEHFragment *F = cast<MCSafeSEHFragment>(this);
+  case MCFragment::FT_SymbolId: {
+    const MCSymbolIdFragment *F = cast<MCSymbolIdFragment>(this);
     OS << "\n       ";
     OS << " Sym:" << F->getSymbol();
     break;
diff --git a/lib/MC/MCWinCOFFStreamer.cpp b/lib/MC/MCWinCOFFStreamer.cpp
index 7e0533b8e00..c2583d95c5e 100644
--- a/lib/MC/MCWinCOFFStreamer.cpp
+++ b/lib/MC/MCWinCOFFStreamer.cpp
@@ -182,7 +182,7 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
   if (SXData->getAlignment() < 4)
     SXData->setAlignment(4);
 
-  new MCSafeSEHFragment(Symbol, SXData);
+  new MCSymbolIdFragment(Symbol, SXData);
 
   getAssembler().registerSymbol(*Symbol);
   CSymbol->setIsSafeSEH();
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index 919e2676802..63f5082c29d 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,11 +122,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
 static bool isBSDLike(object::Archive::Kind Kind) {
   switch (Kind) {
   case object::Archive::K_GNU:
+  case object::Archive::K_GNU64:
     return false;
   case object::Archive::K_BSD:
   case object::Archive::K_DARWIN:
     return true;
-  case object::Archive::K_GNU64:
   case object::Archive::K_DARWIN64:
   case object::Archive::K_COFF:
     break;
@@ -134,8 +134,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
   llvm_unreachable("not supported for writting");
 }
 
-static void print32(raw_ostream &Out, object::Archive::Kind Kind,
-                    uint32_t Val) {
+template <class T>
+static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val) {
   if (isBSDLike(Kind))
     support::endian::Writer<support::little>(Out).write(Val);
   else
@@ -216,6 +216,20 @@ static std::string computeRelativePath(StringRef From, StringRef To) {
   return Relative.str();
 }
 
+static bool is64BitKind(object::Archive::Kind Kind) {
+  switch (Kind) {
+  case object::Archive::K_GNU:
+  case object::Archive::K_BSD:
+  case object::Archive::K_DARWIN:
+  case object::Archive::K_COFF:
+    return false;
+  case object::Archive::K_DARWIN64:
+  case object::Archive::K_GNU64:
+    return true;
+  }
+  llvm_unreachable("not supported for writting");
+}
+
 static void addToStringTable(raw_ostream &Out, StringRef ArcName,
                              const NewArchiveMember &M, bool Thin) {
   StringRef ID = M.Buf->getBufferIdentifier();
@@ -288,6 +302,14 @@ static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
   return true;
 }
 
+static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
+                       uint64_t Val) {
+  if (is64BitKind(Kind))
+    print<uint64_t>(Out, Kind, Val);
+  else
+    print<uint32_t>(Out, Kind, Val);
+}
+
 static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
                              bool Deterministic, ArrayRef<MemberData> Members,
                              StringRef StringTable) {
@@ -299,9 +321,11 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
     NumSyms += M.Symbols.size();
 
   unsigned Size = 0;
-  Size += 4; // Number of entries
+  Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
   if (isBSDLike(Kind))
     Size += NumSyms * 8; // Table
+  else if (is64BitKind(Kind))
+    Size += NumSyms * 8; // Table
   else
     Size += NumSyms * 4; // Table
   if (isBSDLike(Kind))
@@ -318,27 +342,30 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
   if (isBSDLike(Kind))
     printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
                          0, Size);
+  else if (is64BitKind(Kind))
+    printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
   else
     printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);
 
   uint64_t Pos = Out.tell() + Size;
 
   if (isBSDLike(Kind))
-    print32(Out, Kind, NumSyms * 8);
+    print<uint32_t>(Out, Kind, NumSyms * 8);
   else
-    print32(Out, Kind, NumSyms);
+    printNBits(Out, Kind, NumSyms);
 
   for (const MemberData &M : Members) {
     for (unsigned StringOffset : M.Symbols) {
       if (isBSDLike(Kind))
-        print32(Out, Kind, StringOffset);
-      print32(Out, Kind, Pos); // member offset
+        print<uint32_t>(Out, Kind, StringOffset);
+      printNBits(Out, Kind, Pos); // member offset
     }
     Pos += M.Header.size() + M.Data.size() + M.Padding.size();
   }
 
   if (isBSDLike(Kind))
-    print32(Out, Kind, StringTable.size()); // byte count of the string table
+    // byte count of the string table
+    print<uint32_t>(Out, Kind, StringTable.size());
   Out << StringTable;
 
   while (Pad--)
@@ -442,6 +469,25 @@ Error llvm::writeArchive(StringRef ArcName,
   if (!StringTableBuf.empty())
     Data.insert(Data.begin(), computeStringTable(StringTableBuf));
 
+  // We would like to detect if we need to switch to a 64-bit symbol table.
+  if (WriteSymtab) {
+    uint64_t MaxOffset = 0;
+    uint64_t LastOffset = MaxOffset;
+    for (const auto& M : Data) {
+      // Record the start of the member's offset
+      LastOffset = MaxOffset;
+      // Account for the size of each part associated with the member.
+      MaxOffset += M.Header.size() + M.Data.size() + M.Padding.size();
+      // We assume 32-bit symbols to see if 32-bit symbols are possible or not.
+      MaxOffset += M.Symbols.size() * 4;
+    }
+    // If LastOffset isn't going to fit in a 32-bit varible we need to switch
+    // to 64-bit. Note that the file can be larger than 4GB as long as the last
+    // member starts before the 4GB offset.
+    if (LastOffset >> 32 != 0)
+      Kind = object::Archive::K_GNU64;
+  }
+
   SmallString<128> TmpArchive;
   int TmpArchiveFD;
   if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index ef8c844a66f..c72a1258c1e 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -215,269 +215,6 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
 }
 
 template <class ELFT>
-Expected<uint32_t>
-ELFFile<ELFT>::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms,
-                               ArrayRef<Elf_Word> ShndxTable) const {
-  uint32_t Index = Sym->st_shndx;
-  if (Index == ELF::SHN_XINDEX) {
-    auto ErrorOrIndex = getExtendedSymbolTableIndex<ELFT>(
-        Sym, Syms.begin(), ShndxTable);
-    if (!ErrorOrIndex)
-      return ErrorOrIndex.takeError();
-    return *ErrorOrIndex;
-  }
-  if (Index == ELF::SHN_UNDEF || Index >= ELF::SHN_LORESERVE)
-    return 0;
-  return Index;
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(const Elf_Sym *Sym, const Elf_Shdr *SymTab,
-                          ArrayRef<Elf_Word> ShndxTable) const {
-  auto SymsOrErr = symbols(SymTab);
-  if (!SymsOrErr)
-    return SymsOrErr.takeError();
-  return getSection(Sym, *SymsOrErr, ShndxTable);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols,
-                          ArrayRef<Elf_Word> ShndxTable) const {
-  auto IndexOrErr = getSectionIndex(Sym, Symbols, ShndxTable);
-  if (!IndexOrErr)
-    return IndexOrErr.takeError();
-  uint32_t Index = *IndexOrErr;
-  if (Index == 0)
-    return nullptr;
-  return getSection(Index);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Sym *>
-ELFFile<ELFT>::getSymbol(const Elf_Shdr *Sec, uint32_t Index) const {
-  auto SymtabOrErr = symbols(Sec);
-  if (!SymtabOrErr)
-    return SymtabOrErr.takeError();
-  return object::getSymbol<ELFT>(*SymtabOrErr, Index);
-}
-
-template <class ELFT>
-Expected<ArrayRef<uint8_t>>
-ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
-  return getSectionContentsAsArray<uint8_t>(Sec);
-}
-
-template <class ELFT>
-StringRef ELFFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
-  return getELFRelocationTypeName(getHeader()->e_machine, Type);
-}
-
-template <class ELFT>
-void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
-                                          SmallVectorImpl<char> &Result) const {
-  if (!isMipsELF64()) {
-    StringRef Name = getRelocationTypeName(Type);
-    Result.append(Name.begin(), Name.end());
-  } else {
-    // The Mips N64 ABI allows up to three operations to be specified per
-    // relocation record. Unfortunately there's no easy way to test for the
-    // presence of N64 ELFs as they have no special flag that identifies them
-    // as being N64. We can safely assume at the moment that all Mips
-    // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough
-    // information to disambiguate between old vs new ABIs.
-    uint8_t Type1 = (Type >> 0) & 0xFF;
-    uint8_t Type2 = (Type >> 8) & 0xFF;
-    uint8_t Type3 = (Type >> 16) & 0xFF;
-
-    // Concat all three relocation type names.
-    StringRef Name = getRelocationTypeName(Type1);
-    Result.append(Name.begin(), Name.end());
-
-    Name = getRelocationTypeName(Type2);
-    Result.append(1, '/');
-    Result.append(Name.begin(), Name.end());
-
-    Name = getRelocationTypeName(Type3);
-    Result.append(1, '/');
-    Result.append(Name.begin(), Name.end());
-  }
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Sym *>
-ELFFile<ELFT>::getRelocationSymbol(const Elf_Rel *Rel,
-                                   const Elf_Shdr *SymTab) const {
-  uint32_t Index = Rel->getSymbol(isMips64EL());
-  if (Index == 0)
-    return nullptr;
-  return getEntry<Elf_Sym>(SymTab, Index);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getSectionStringTable(Elf_Shdr_Range Sections) const {
-  uint32_t Index = getHeader()->e_shstrndx;
-  if (Index == ELF::SHN_XINDEX)
-    Index = Sections[0].sh_link;
-
-  if (!Index) // no section string table.
-    return "";
-  if (Index >= Sections.size())
-    return createError("invalid section index");
-  return getStringTable(&Sections[Index]);
-}
-
-template <class ELFT> ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {}
-
-template <class ELFT>
-Expected<ELFFile<ELFT>> ELFFile<ELFT>::create(StringRef Object) {
-  if (sizeof(Elf_Ehdr) > Object.size())
-    return createError("Invalid buffer");
-  return ELFFile(Object);
-}
-
-template <class ELFT>
-Expected<typename ELFT::ShdrRange> ELFFile<ELFT>::sections() const {
-  const uintX_t SectionTableOffset = getHeader()->e_shoff;
-  if (SectionTableOffset == 0)
-    return ArrayRef<Elf_Shdr>();
-
-  if (getHeader()->e_shentsize != sizeof(Elf_Shdr))
-    return createError(
-        "invalid section header entry size (e_shentsize) in ELF header");
-
-  const uint64_t FileSize = Buf.size();
-
-  if (SectionTableOffset + sizeof(Elf_Shdr) > FileSize)
-    return createError("section header table goes past the end of the file");
-
-  // Invalid address alignment of section headers
-  if (SectionTableOffset & (alignof(Elf_Shdr) - 1))
-    return createError("invalid alignment of section headers");
-
-  const Elf_Shdr *First =
-      reinterpret_cast<const Elf_Shdr *>(base() + SectionTableOffset);
-
-  uintX_t NumSections = getHeader()->e_shnum;
-  if (NumSections == 0)
-    NumSections = First->sh_size;
-
-  if (NumSections > UINT64_MAX / sizeof(Elf_Shdr))
-    return createError("section table goes past the end of file");
-
-  const uint64_t SectionTableSize = NumSections * sizeof(Elf_Shdr);
-
-  // Section table goes past end of file!
-  if (SectionTableOffset + SectionTableSize > FileSize)
-    return createError("section table goes past the end of file");
-
-  return makeArrayRef(First, NumSections);
-}
-
-template <class ELFT>
-Expected<const typename ELFT::Shdr *>
-ELFFile<ELFT>::getSection(uint32_t Index) const {
-  auto TableOrErr = sections();
-  if (!TableOrErr)
-    return TableOrErr.takeError();
-  return object::getSection<ELFT>(*TableOrErr, Index);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTable(const Elf_Shdr *Section) const {
-  if (Section->sh_type != ELF::SHT_STRTAB)
-    return createError("invalid sh_type for string table, expected SHT_STRTAB");
-  auto V = getSectionContentsAsArray<char>(Section);
-  if (!V)
-    return V.takeError();
-  ArrayRef<char> Data = *V;
-  if (Data.empty())
-    return createError("empty string table");
-  if (Data.back() != '\0')
-    return createError("string table non-null terminated");
-  return StringRef(Data.begin(), Data.size());
-}
-
-template <class ELFT>
-Expected<ArrayRef<typename ELFT::Word>>
-ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section) const {
-  auto SectionsOrErr = sections();
-  if (!SectionsOrErr)
-    return SectionsOrErr.takeError();
-  return getSHNDXTable(Section, *SectionsOrErr);
-}
-
-template <class ELFT>
-Expected<ArrayRef<typename ELFT::Word>>
-ELFFile<ELFT>::getSHNDXTable(const Elf_Shdr &Section,
-                             Elf_Shdr_Range Sections) const {
-  assert(Section.sh_type == ELF::SHT_SYMTAB_SHNDX);
-  auto VOrErr = getSectionContentsAsArray<Elf_Word>(&Section);
-  if (!VOrErr)
-    return VOrErr.takeError();
-  ArrayRef<Elf_Word> V = *VOrErr;
-  auto SymTableOrErr = object::getSection<ELFT>(Sections, Section.sh_link);
-  if (!SymTableOrErr)
-    return SymTableOrErr.takeError();
-  const Elf_Shdr &SymTable = **SymTableOrErr;
-  if (SymTable.sh_type != ELF::SHT_SYMTAB &&
-      SymTable.sh_type != ELF::SHT_DYNSYM)
-    return createError("invalid sh_type");
-  if (V.size() != (SymTable.sh_size / sizeof(Elf_Sym)))
-    return createError("invalid section contents size");
-  return V;
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec) const {
-  auto SectionsOrErr = sections();
-  if (!SectionsOrErr)
-    return SectionsOrErr.takeError();
-  return getStringTableForSymtab(Sec, *SectionsOrErr);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getStringTableForSymtab(const Elf_Shdr &Sec,
-                                       Elf_Shdr_Range Sections) const {
-
-  if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
-    return createError(
-        "invalid sh_type for symbol table, expected SHT_SYMTAB or SHT_DYNSYM");
-  auto SectionOrErr = object::getSection<ELFT>(Sections, Sec.sh_link);
-  if (!SectionOrErr)
-    return SectionOrErr.takeError();
-  return getStringTable(*SectionOrErr);
-}
-
-template <class ELFT>
-Expected<StringRef>
-ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section) const {
-  auto SectionsOrErr = sections();
-  if (!SectionsOrErr)
-    return SectionsOrErr.takeError();
-  auto Table = getSectionStringTable(*SectionsOrErr);
-  if (!Table)
-    return Table.takeError();
-  return getSectionName(Section, *Table);
-}
-
-template <class ELFT>
-Expected<StringRef> ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section,
-                                                  StringRef DotShstrtab) const {
-  uint32_t Offset = Section->sh_name;
-  if (Offset == 0)
-    return StringRef();
-  if (Offset >= DotShstrtab.size())
-    return createError("invalid string offset");
-  return StringRef(DotShstrtab.data() + Offset);
-}
-
-template <class ELFT>
 Expected<std::vector<typename ELFT::Rela>>
 ELFFile<ELFT>::android_relas(const Elf_Shdr *Sec) const {
   // This function reads relocations in Android's packed relocation format,
diff --git a/lib/ObjectYAML/COFFYAML.cpp b/lib/ObjectYAML/COFFYAML.cpp
index 1103159fc98..056a1aa3ca1 100644
--- a/lib/ObjectYAML/COFFYAML.cpp
+++ b/lib/ObjectYAML/COFFYAML.cpp
@@ -178,6 +178,46 @@ void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration(
   ECase(IMAGE_REL_AMD64_SSPAN32);
 }
 
+void ScalarEnumerationTraits<COFF::RelocationTypesARM>::enumeration(
+    IO &IO, COFF::RelocationTypesARM &Value) {
+  ECase(IMAGE_REL_ARM_ABSOLUTE);
+  ECase(IMAGE_REL_ARM_ADDR32);
+  ECase(IMAGE_REL_ARM_ADDR32NB);
+  ECase(IMAGE_REL_ARM_BRANCH24);
+  ECase(IMAGE_REL_ARM_BRANCH11);
+  ECase(IMAGE_REL_ARM_TOKEN);
+  ECase(IMAGE_REL_ARM_BLX24);
+  ECase(IMAGE_REL_ARM_BLX11);
+  ECase(IMAGE_REL_ARM_SECTION);
+  ECase(IMAGE_REL_ARM_SECREL);
+  ECase(IMAGE_REL_ARM_MOV32A);
+  ECase(IMAGE_REL_ARM_MOV32T);
+  ECase(IMAGE_REL_ARM_BRANCH20T);
+  ECase(IMAGE_REL_ARM_BRANCH24T);
+  ECase(IMAGE_REL_ARM_BLX23T);
+}
+
+void ScalarEnumerationTraits<COFF::RelocationTypesARM64>::enumeration(
+    IO &IO, COFF::RelocationTypesARM64 &Value) {
+  ECase(IMAGE_REL_ARM64_ABSOLUTE);
+  ECase(IMAGE_REL_ARM64_ADDR32);
+  ECase(IMAGE_REL_ARM64_ADDR32NB);
+  ECase(IMAGE_REL_ARM64_BRANCH26);
+  ECase(IMAGE_REL_ARM64_PAGEBASE_REL21);
+  ECase(IMAGE_REL_ARM64_REL21);
+  ECase(IMAGE_REL_ARM64_PAGEOFFSET_12A);
+  ECase(IMAGE_REL_ARM64_PAGEOFFSET_12L);
+  ECase(IMAGE_REL_ARM64_SECREL);
+  ECase(IMAGE_REL_ARM64_SECREL_LOW12A);
+  ECase(IMAGE_REL_ARM64_SECREL_HIGH12A);
+  ECase(IMAGE_REL_ARM64_SECREL_LOW12L);
+  ECase(IMAGE_REL_ARM64_TOKEN);
+  ECase(IMAGE_REL_ARM64_SECTION);
+  ECase(IMAGE_REL_ARM64_ADDR64);
+  ECase(IMAGE_REL_ARM64_BRANCH19);
+  ECase(IMAGE_REL_ARM64_BRANCH14);
+}
+
 void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration(
     IO &IO, COFF::WindowsSubsystem &Value) {
   ECase(IMAGE_SUBSYSTEM_UNKNOWN);
@@ -378,6 +418,14 @@ void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
     MappingNormalization<NType<COFF::RelocationTypeAMD64>, uint16_t> NT(
         IO, Rel.Type);
     IO.mapRequired("Type", NT->Type);
+  } else if (H.Machine == COFF::IMAGE_FILE_MACHINE_ARMNT) {
+    MappingNormalization<NType<COFF::RelocationTypesARM>, uint16_t> NT(
+        IO, Rel.Type);
+    IO.mapRequired("Type", NT->Type);
+  } else if (H.Machine == COFF::IMAGE_FILE_MACHINE_ARM64) {
+    MappingNormalization<NType<COFF::RelocationTypesARM64>, uint16_t> NT(
+        IO, Rel.Type);
+    IO.mapRequired("Type", NT->Type);
   } else {
     IO.mapRequired("Type", Rel.Type);
   }
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 8796ff56e5e..9abbdba26cb 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/Scalar/ADCE.h"
 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
 #include "llvm/Transforms/Scalar/DCE.h"
@@ -548,6 +549,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   EarlyFPM.addPass(SROA());
   EarlyFPM.addPass(EarlyCSEPass());
   EarlyFPM.addPass(LowerExpectIntrinsicPass());
+  if (Level == O3)
+    EarlyFPM.addPass(CallSiteSplittingPass());
+
   // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
   // to convert bitcast to direct calls so that they can be inlined during the
   // profile annotation prepration step.
@@ -915,13 +919,16 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   MPM.addPass(InferFunctionAttrsPass());
 
   if (Level > 1) {
+    FunctionPassManager EarlyFPM(DebugLogging);
+    EarlyFPM.addPass(CallSiteSplittingPass());
+    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
     MPM.addPass(PGOIndirectCallPromotion(
         true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
-
     // Propagate constants at call sites into the functions they call.  This
     // opens opportunities for globalopt (and inlining) by substituting function
     // pointers passed as arguments to direct uses of functions.
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 20d1220ac33..40b884351fd 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -140,6 +140,7 @@ FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
 FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
 FUNCTION_PASS("bdce", BDCEPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
+FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
 FUNCTION_PASS("consthoist", ConstantHoistingPass())
 FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
 FUNCTION_PASS("dce", DCEPass())
diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt
index cd65762ae6a..3a981d8acf4 100644
--- a/lib/ProfileData/CMakeLists.txt
+++ b/lib/ProfileData/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMProfileData
+  GCOV.cpp
   InstrProf.cpp
   InstrProfReader.cpp
   InstrProfWriter.cpp
diff --git a/lib/IR/GCOV.cpp b/lib/ProfileData/GCOV.cpp
index d4b45522822..d6e44389f2b 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/ProfileData/GCOV.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/GCOV.h"
+#include "llvm/ProfileData/GCOV.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
diff --git a/lib/Support/Chrono.cpp b/lib/Support/Chrono.cpp
index a39b485bd13..84f5aab6fc4 100644
--- a/lib/Support/Chrono.cpp
+++ b/lib/Support/Chrono.cpp
@@ -65,17 +65,17 @@ void format_provider<TimePoint<>>::format(const TimePoint<> &T, raw_ostream &OS,
     if (Style[I] == '%' && Style.size() > I + 1) switch (Style[I + 1]) {
         case 'L':  // Milliseconds, from Ruby.
           FStream << llvm::format(
-              "%.3lu", duration_cast<milliseconds>(Fractional).count());
+              "%.3lu", (long)duration_cast<milliseconds>(Fractional).count());
           ++I;
           continue;
         case 'f':  // Microseconds, from Python.
           FStream << llvm::format(
-              "%.6lu", duration_cast<microseconds>(Fractional).count());
+              "%.6lu", (long)duration_cast<microseconds>(Fractional).count());
           ++I;
           continue;
         case 'N':  // Nanoseconds, from date(1).
           FStream << llvm::format(
-              "%.6lu", duration_cast<nanoseconds>(Fractional).count());
+              "%.6lu", (long)duration_cast<nanoseconds>(Fractional).count());
           ++I;
           continue;
         case '%':  // Consume %%, so %%f parses as (%%)f not %(%f)
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 1e20b01fc4a..8906be3aaa2 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -38,7 +38,7 @@ public:
                std::unique_ptr<fs::mapped_file_region> Buf)
       : FileOutputBuffer(Path), Buffer(std::move(Buf)), TempPath(TempPath) {}
 
-  static ErrorOr<std::unique_ptr<OnDiskBuffer>>
+  static Expected<std::unique_ptr<OnDiskBuffer>>
   create(StringRef Path, size_t Size, unsigned Mode);
 
   uint8_t *getBufferStart() const override { return (uint8_t *)Buffer->data(); }
@@ -49,14 +49,14 @@ public:
 
   size_t getBufferSize() const override { return Buffer->size(); }
 
-  std::error_code commit() override {
+  Error commit() override {
     // Unmap buffer, letting OS flush dirty pages to file on disk.
     Buffer.reset();
 
     // Atomically replace the existing file with the new one.
     auto EC = fs::rename(TempPath, FinalPath);
     sys::DontRemoveFileOnSignal(TempPath);
-    return EC;
+    return errorCodeToError(EC);
   }
 
   ~OnDiskBuffer() override {
@@ -78,13 +78,13 @@ public:
   InMemoryBuffer(StringRef Path, MemoryBlock Buf, unsigned Mode)
       : FileOutputBuffer(Path), Buffer(Buf), Mode(Mode) {}
 
-  static ErrorOr<std::unique_ptr<InMemoryBuffer>>
+  static Expected<std::unique_ptr<InMemoryBuffer>>
   create(StringRef Path, size_t Size, unsigned Mode) {
     std::error_code EC;
     MemoryBlock MB = Memory::allocateMappedMemory(
         Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
     if (EC)
-      return EC;
+      return errorCodeToError(EC);
     return llvm::make_unique<InMemoryBuffer>(Path, MB, Mode);
   }
 
@@ -96,14 +96,14 @@ public:
 
   size_t getBufferSize() const override { return Buffer.size(); }
 
-  std::error_code commit() override {
+  Error commit() override {
     int FD;
     std::error_code EC;
     if (auto EC = openFileForWrite(FinalPath, FD, fs::F_None, Mode))
-      return EC;
+      return errorCodeToError(EC);
     raw_fd_ostream OS(FD, /*shouldClose=*/true, /*unbuffered=*/true);
     OS << StringRef((const char *)Buffer.base(), Buffer.size());
-    return std::error_code();
+    return Error::success();
   }
 
 private:
@@ -111,13 +111,13 @@ private:
   unsigned Mode;
 };
 
-ErrorOr<std::unique_ptr<OnDiskBuffer>>
+Expected<std::unique_ptr<OnDiskBuffer>>
 OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
   // Create new file in same directory but with random name.
   SmallString<128> TempPath;
   int FD;
   if (auto EC = fs::createUniqueFile(Path + ".tmp%%%%%%%", FD, TempPath, Mode))
-    return EC;
+    return errorCodeToError(EC);
 
   sys::RemoveFileOnSignal(TempPath);
 
@@ -128,7 +128,7 @@ OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
   // pretty slow just like it writes specified amount of bytes,
   // so we should avoid calling that function.
   if (auto EC = fs::resize_file(FD, Size))
-    return EC;
+    return errorCodeToError(EC);
 #endif
 
   // Mmap it.
@@ -137,12 +137,12 @@ OnDiskBuffer::create(StringRef Path, size_t Size, unsigned Mode) {
       FD, fs::mapped_file_region::readwrite, Size, 0, EC);
   close(FD);
   if (EC)
-    return EC;
+    return errorCodeToError(EC);
   return llvm::make_unique<OnDiskBuffer>(Path, TempPath, std::move(MappedFile));
 }
 
 // Create an instance of FileOutputBuffer.
-ErrorOr<std::unique_ptr<FileOutputBuffer>>
+Expected<std::unique_ptr<FileOutputBuffer>>
 FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
   unsigned Mode = fs::all_read | fs::all_write;
   if (Flags & F_executable)
@@ -161,7 +161,7 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
   // destination file and write to it on commit().
   switch (Stat.type()) {
   case fs::file_type::directory_file:
-    return errc::is_a_directory;
+    return errorCodeToError(errc::is_a_directory);
   case fs::file_type::regular_file:
   case fs::file_type::file_not_found:
   case fs::file_type::status_error:
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index d8fb3e1dc1d..5b2a0f1d0c2 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -351,12 +351,14 @@ enum ProcessorTypes {
   INTEL_PENTIUM_IV,
   INTEL_PENTIUM_M,
   INTEL_CORE_DUO,
-  INTEL_X86_64,
   INTEL_NOCONA,
   INTEL_PRESCOTT,
   AMD_i486,
   AMDPENTIUM,
-  AMDATHLON,
+  AMD_ATHLON,
+  AMD_ATHLON_XP,
+  AMD_K8,
+  AMD_K8SSE3,
   INTEL_GOLDMONT,
   CPU_TYPE_MAX
 };
@@ -385,10 +387,6 @@ enum ProcessorSubtypes {
   AMDPENTIUM_K62,
   AMDPENTIUM_K63,
   AMDPENTIUM_GEODE,
-  AMDATHLON_CLASSIC,
-  AMDATHLON_XP,
-  AMDATHLON_K8,
-  AMDATHLON_K8SSE3,
   CPU_SUBTYPE_MAX
 };
 
@@ -794,8 +792,13 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
       if (Features2 & (1 << (FEATURE_EM64T - 32))) {
-        *Type = INTEL_X86_64;
-        break; // x86-64
+        *Type = INTEL_CORE2; // "core2"
+        *Subtype = INTEL_CORE2_65;
+        break;
+      }
+      if (Features & (1 << FEATURE_SSE3)) {
+        *Type = INTEL_CORE_DUO;
+        break;
       }
       if (Features & (1 << FEATURE_SSE2)) {
         *Type = INTEL_PENTIUM_M;
@@ -814,40 +817,15 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 15: {
-    switch (Model) {
-    case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
-            // model 00h and manufactured using the 0.18 micron process.
-    case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
-            // processor MP, and Intel Celeron processor. All processors are
-            // model 01h and manufactured using the 0.18 micron process.
-    case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
-            // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
-            // processor, and Mobile Intel Celeron processor. All processors
-            // are model 02h and manufactured using the 0.13 micron process.
-      *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
-                                                         : INTEL_PENTIUM_IV);
+    if (Features2 & (1 << (FEATURE_EM64T - 32))) {
+      *Type = INTEL_NOCONA;
       break;
-
-    case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
-            // processor. All processors are model 03h and manufactured using
-            // the 90 nm process.
-    case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
-            // Pentium D processor, Intel Xeon processor, Intel Xeon
-            // processor MP, Intel Celeron D processor. All processors are
-            // model 04h and manufactured using the 90 nm process.
-    case 6: // Pentium 4 processor, Pentium D processor, Pentium processor
-            // Extreme Edition, Intel Xeon processor, Intel Xeon processor
-            // MP, Intel Celeron D processor. All processors are model 06h
-            // and manufactured using the 65 nm process.
-      *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_NOCONA
-                                                         : INTEL_PRESCOTT);
-      break;
-
-    default:
-      *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
-                                                         : INTEL_PENTIUM_IV);
+    }
+    if (Features & (1 << FEATURE_SSE3)) {
+      *Type = INTEL_PRESCOTT;
       break;
     }
+    *Type = INTEL_PENTIUM_IV;
     break;
   }
   default:
@@ -885,20 +863,18 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 6:
-    *Type = AMDATHLON;
     if (Features & (1 << FEATURE_SSE)) {
-      *Subtype = AMDATHLON_XP;
+      *Type = AMD_ATHLON_XP;
       break; // "athlon-xp"
     }
-    *Subtype = AMDATHLON_CLASSIC;
+    *Type = AMD_ATHLON;
     break; // "athlon"
   case 15:
-    *Type = AMDATHLON;
     if (Features & (1 << FEATURE_SSE3)) {
-      *Subtype = AMDATHLON_K8SSE3;
+      *Type = AMD_K8SSE3;
       break; // "k8-sse3"
     }
-    *Subtype = AMDATHLON_K8;
+    *Type = AMD_K8;
     break; // "k8"
   case 16:
     *Type = AMDFAM10H; // "amdfam10"
@@ -1078,8 +1054,8 @@ StringRef sys::getHostCPUName() {
   detectX86FamilyModel(EAX, &Family, &Model);
   getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
 
-  unsigned Type;
-  unsigned Subtype;
+  unsigned Type = 0;
+  unsigned Subtype = 0;
 
   if (Vendor == SIG_INTEL) {
     getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
@@ -1145,8 +1121,6 @@ StringRef sys::getHostCPUName() {
       return "knl";
     case INTEL_KNM:
       return "knm";
-    case INTEL_X86_64:
-      return "x86-64";
     case INTEL_NOCONA:
       return "nocona";
     case INTEL_PRESCOTT:
@@ -1172,19 +1146,14 @@ StringRef sys::getHostCPUName() {
       default:
         return "pentium";
       }
-    case AMDATHLON:
-      switch (Subtype) {
-      case AMDATHLON_CLASSIC:
-        return "athlon";
-      case AMDATHLON_XP:
-        return "athlon-xp";
-      case AMDATHLON_K8:
-        return "k8";
-      case AMDATHLON_K8SSE3:
-        return "k8-sse3";
-      default:
-        llvm_unreachable("Unexpected subtype!");
-      }
+    case AMD_ATHLON:
+      return "athlon";
+    case AMD_ATHLON_XP:
+      return "athlon-xp";
+    case AMD_K8:
+      return "k8";
+    case AMD_K8SSE3:
+      return "k8-sse3";
     case AMDFAM10H:
       return "amdfam10";
     case AMD_BTVER1:
diff --git a/lib/Support/LowLevelType.cpp b/lib/Support/LowLevelType.cpp
index 0ee3f1d0119..cb2187405d6 100644
--- a/lib/Support/LowLevelType.cpp
+++ b/lib/Support/LowLevelType.cpp
@@ -43,7 +43,7 @@ void LLT::print(raw_ostream &OS) const {
     assert(isScalar() && "unexpected type");
     OS << "s" << getScalarSizeInBits();
   } else
-    llvm_unreachable("trying to print an invalid type");
+    OS << "LLT_invalid";
 }
 
 const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index a659a2afee6..bf807e66e02 100644
--- a/lib/Support/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp
@@ -27,6 +27,7 @@
 namespace llvm {
 
 bool SpecialCaseList::Matcher::insert(std::string Regexp,
+                                      unsigned LineNumber,
                                       std::string &REError) {
   if (Regexp.empty()) {
     REError = "Supplied regexp was blank";
@@ -34,7 +35,7 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp,
   }
 
   if (Regex::isLiteralERE(Regexp)) {
-    Strings.insert(Regexp);
+    Strings[Regexp] = LineNumber;
     return true;
   }
   Trigrams.insert(Regexp);
@@ -45,34 +46,30 @@ bool SpecialCaseList::Matcher::insert(std::string Regexp,
     Regexp.replace(pos, strlen("*"), ".*");
   }
 
+  Regexp = (Twine("^(") + StringRef(Regexp) + ")$").str();
+
   // Check that the regexp is valid.
   Regex CheckRE(Regexp);
   if (!CheckRE.isValid(REError))
     return false;
 
-  if (!UncompiledRegEx.empty())
-    UncompiledRegEx += "|";
-  UncompiledRegEx += "^(" + Regexp + ")$";
+  RegExes.emplace_back(
+      std::make_pair(make_unique<Regex>(std::move(CheckRE)), LineNumber));
   return true;
 }
 
-void SpecialCaseList::Matcher::compile() {
-  if (!UncompiledRegEx.empty()) {
-    RegEx.reset(new Regex(UncompiledRegEx));
-    UncompiledRegEx.clear();
-  }
-}
-
-bool SpecialCaseList::Matcher::match(StringRef Query) const {
-  if (Strings.count(Query))
-    return true;
+unsigned SpecialCaseList::Matcher::match(StringRef Query) const {
+  auto It = Strings.find(Query);
+  if (It != Strings.end())
+    return It->second;
   if (Trigrams.isDefinitelyOut(Query))
     return false;
-  return RegEx && RegEx->match(Query);
+  for (auto& RegExKV : RegExes)
+    if (RegExKV.first->match(Query))
+      return RegExKV.second;
+  return 0;
 }
 
-SpecialCaseList::SpecialCaseList() : Sections(), IsCompiled(false) {}
-
 std::unique_ptr<SpecialCaseList>
 SpecialCaseList::create(const std::vector<std::string> &Paths,
                         std::string &Error) {
@@ -114,7 +111,6 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
       return false;
     }
   }
-  compile();
   return true;
 }
 
@@ -123,7 +119,6 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB,
   StringMap<size_t> Sections;
   if (!parse(MB, Sections, Error))
     return false;
-  compile();
   return true;
 }
 
@@ -132,11 +127,13 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
                             std::string &Error) {
   // Iterate through each line in the blacklist file.
   SmallVector<StringRef, 16> Lines;
-  SplitString(MB->getBuffer(), Lines, "\n\r");
+  MB->getBuffer().split(Lines, '\n');
 
-  int LineNo = 1;
+  unsigned LineNo = 1;
   StringRef Section = "*";
+
   for (auto I = Lines.begin(), E = Lines.end(); I != E; ++I, ++LineNo) {
+    *I = I->trim();
     // Ignore empty lines and lines starting with "#"
     if (I->empty() || I->startswith("#"))
       continue;
@@ -181,11 +178,10 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
     if (SectionsMap.find(Section) == SectionsMap.end()) {
       std::unique_ptr<Matcher> M = make_unique<Matcher>();
       std::string REError;
-      if (!M->insert(Section, REError)) {
+      if (!M->insert(Section, LineNo, REError)) {
         Error = (Twine("malformed section ") + Section + ": '" + REError).str();
         return false;
       }
-      M->compile();
 
       SectionsMap[Section] = Sections.size();
       Sections.emplace_back(std::move(M));
@@ -193,7 +189,7 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
 
     auto &Entry = Sections[SectionsMap[Section]].Entries[Prefix][Category];
     std::string REError;
-    if (!Entry.insert(std::move(Regexp), REError)) {
+    if (!Entry.insert(std::move(Regexp), LineNo, REError)) {
       Error = (Twine("malformed regex in line ") + Twine(LineNo) + ": '" +
                SplitLine.second + "': " + REError).str();
       return false;
@@ -202,38 +198,33 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
   return true;
 }
 
-void SpecialCaseList::compile() {
-  assert(!IsCompiled && "compile() should only be called once");
-  // Iterate through every section compiling regular expressions for every query
-  // and creating Section entries.
-  for (auto &Section : Sections)
-    for (auto &Prefix : Section.Entries)
-      for (auto &Category : Prefix.getValue())
-        Category.getValue().compile();
-
-  IsCompiled = true;
-}
-
 SpecialCaseList::~SpecialCaseList() {}
 
 bool SpecialCaseList::inSection(StringRef Section, StringRef Prefix,
                                 StringRef Query, StringRef Category) const {
-  assert(IsCompiled && "SpecialCaseList::compile() was not called!");
+  return inSectionBlame(Section, Prefix, Query, Category);
+}
 
+unsigned SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix,
+                                         StringRef Query,
+                                         StringRef Category) const {
   for (auto &SectionIter : Sections)
-    if (SectionIter.SectionMatcher->match(Section) &&
-        inSection(SectionIter.Entries, Prefix, Query, Category))
-      return true;
-
-  return false;
+    if (SectionIter.SectionMatcher->match(Section)) {
+      unsigned Blame =
+          inSectionBlame(SectionIter.Entries, Prefix, Query, Category);
+      if (Blame)
+        return Blame;
+    }
+  return 0;
 }
 
-bool SpecialCaseList::inSection(const SectionEntries &Entries, StringRef Prefix,
-                                StringRef Query, StringRef Category) const {
+unsigned SpecialCaseList::inSectionBlame(const SectionEntries &Entries,
+                                         StringRef Prefix, StringRef Query,
+                                         StringRef Category) const {
   SectionEntries::const_iterator I = Entries.find(Prefix);
-  if (I == Entries.end()) return false;
+  if (I == Entries.end()) return 0;
   StringMap<Matcher>::const_iterator II = I->second.find(Category);
-  if (II == I->second.end()) return false;
+  if (II == I->second.end()) return 0;
 
   return II->getValue().match(Query);
 }
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 781a911ed57..2ecb97316c8 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -426,7 +426,7 @@ std::error_code resize_file(int FD, uint64_t Size) {
   // If we have posix_fallocate use it. Unlike ftruncate it always allocates
   // space, so we get an error if the disk is full.
   if (int Err = ::posix_fallocate(FD, 0, Size)) {
-    if (Err != EOPNOTSUPP)
+    if (Err != EINVAL && Err != EOPNOTSUPP)
       return std::error_code(Err, std::generic_category());
   }
 #endif
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
index e6afb42440a..7de5d0ef66b 100644
--- a/lib/Target/AArch64/AArch64A53Fix835769.cpp
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -22,9 +22,9 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
index bc44bc5f246..461c01318d4 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -19,8 +19,8 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 namespace {
 using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 51700f90597..2793481a0c1 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -34,9 +34,9 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8bbef44a2e6..c3a354cb01d 100644
--- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -73,11 +73,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdlib>
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 9eda56c825a..33e0f5de5fd 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -31,10 +31,10 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 5d8b4b69593..d182c812189 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -20,9 +20,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8ebcaff1358..809e4a77fad 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -110,6 +110,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
@@ -121,7 +122,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index c351efb0c39..55a256867fa 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 7d2cfbeff38..39f50ade747 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -14,19 +14,21 @@
 namespace llvm {
 RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
     /* StartIdx, Length, RegBank */
-    // 0: FPR 32-bit value.
+    // 0: FPR 16-bit value.
+    {0, 16, AArch64::FPRRegBank},
+    // 1: FPR 32-bit value.
     {0, 32, AArch64::FPRRegBank},
-    // 1: FPR 64-bit value.
+    // 2: FPR 64-bit value.
     {0, 64, AArch64::FPRRegBank},
-    // 2: FPR 128-bit value.
+    // 3: FPR 128-bit value.
     {0, 128, AArch64::FPRRegBank},
-    // 3: FPR 256-bit value.
+    // 4: FPR 256-bit value.
     {0, 256, AArch64::FPRRegBank},
-    // 4: FPR 512-bit value.
+    // 5: FPR 512-bit value.
     {0, 512, AArch64::FPRRegBank},
-    // 5: GPR 32-bit value.
+    // 6: GPR 32-bit value.
     {0, 32, AArch64::GPRRegBank},
-    // 6: GPR 64-bit value.
+    // 7: GPR 64-bit value.
     {0, 64, AArch64::GPRRegBank},
 };
 
@@ -37,58 +39,77 @@ RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
     {nullptr, 0},
     // 3-operands instructions (all binary operations should end up with one of
     // those mapping).
-    // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+    // 1: FPR 16-bit value. <-- This must match First3OpsIdx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+    // 4: FPR 32-bit value. <-- This must match First3OpsIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 4: FPR 64-bit value.
+    // 7: FPR 64-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
-    // 7: FPR 128-bit value.
+    // 10: FPR 128-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
-    // 10: FPR 256-bit value.
+    // 13: FPR 256-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
-    // 13: FPR 512-bit value.
+    // 16: FPR 512-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
-    // 16: GPR 32-bit value.
+    // 19: GPR 32-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+    // 22: GPR 64-bit value. <-- This must match Last3OpsIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     // Cross register bank copies.
-    // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
+    // 25: FPR 16-bit value to GPR 16-bit (invalid). <-- This must match
     //                                               FirstCrossRegCpyIdx.
+    {nullptr, 1},
+    {nullptr, 1},
+    // 27: FPR 32-bit value to GPR 32-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 24: FPR 64-bit value to GPR 64-bit value.
+    // 29: FPR 64-bit value to GPR 64-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
-    // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+    // 31: FPR 128-bit value to GPR 128-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+    // 33: FPR 256-bit value to GPR 256-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+    // 35: FPR 512-bit value to GPR 512-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 32: GPR 32-bit value to FPR 32-bit value.
+    // 37: GPR 32-bit value to FPR 32-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
+    // 39: GPR 64-bit value to FPR 64-bit value. <-- This must match
     //                                               LastCrossRegCpyIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 41: FPExt: 16 to 32. <-- This must match FPExt16To32Idx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+    // 43: FPExt: 16 to 32. <-- This must match FPExt16To64Idx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1},
+    // 45: FPExt: 32 to 64. <-- This must match FPExt32To64Idx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 47: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx.
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
 };
 
 bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
@@ -145,16 +166,18 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
     return -1;
   }
   if (RBIdx == PMI_FirstFPR) {
-    if (Size <= 32)
+    if (Size <= 16)
       return 0;
-    if (Size <= 64)
+    if (Size <= 32)
       return 1;
-    if (Size <= 128)
+    if (Size <= 64)
       return 2;
-    if (Size <= 256)
+    if (Size <= 128)
       return 3;
-    if (Size <= 512)
+    if (Size <= 256)
       return 4;
+    if (Size <= 512)
+      return 5;
     return -1;
   }
   return -1;
@@ -206,4 +229,35 @@ AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
          ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound");
   return &ValMappings[ValMappingIdx];
 }
+
+const RegisterBankInfo::ValueMapping *
+AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize,
+                                         unsigned SrcSize) {
+  // We support:
+  // - For Scalar:
+  //   - 16 to 32.
+  //   - 16 to 64.
+  //   - 32 to 64.
+  // => FPR 16 to FPR 32|64
+  // => FPR 32 to FPR 64
+  // - For vectors:
+  //   - v4f16 to v4f32
+  //   - v2f32 to v2f64
+  // => FPR 64 to FPR 128
+
+  // Check that we have been asked sensible sizes.
+  if (SrcSize == 16) {
+    assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension");
+    if (DstSize == 32)
+      return &ValMappings[FPExt16To32Idx];
+    return &ValMappings[FPExt16To64Idx];
+  }
+
+  if (SrcSize == 32) {
+    assert(DstSize == 64 && "Unexpected float extension");
+    return &ValMappings[FPExt32To64Idx];
+  }
+  assert((SrcSize == 64 || DstSize == 128) && "Unexpected vector extension");
+  return &ValMappings[FPExt64To128Idx];
+}
 } // End llvm namespace.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index bec872ae8c0..81dac1be56c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -71,7 +72,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
@@ -4981,7 +4981,7 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
       // the result for float (23 mantissa bits) is 2 and for double (52
       // mantissa bits) is 3.
-      ExtraSteps = VT == MVT::f64 ? 3 : 2;
+      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
 
     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
   }
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 24758e97888..2f10bef1e47 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -17,7 +17,7 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "AArch64GenInstrInfo.inc"
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index eabbc05a033..e014d5bd569 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -335,6 +335,7 @@ let RecomputePerFunction = 1 in {
 }
 
 include "AArch64InstrFormats.td"
+include "SVEInstrFormats.td"
 
 //===----------------------------------------------------------------------===//
 
@@ -6275,3 +6276,4 @@ def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
           (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
 
 include "AArch64InstrAtomics.td"
+include "AArch64SVEInstrInfo.td"
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 2d45be37ca7..3a456255224 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -23,6 +23,110 @@
 
 using namespace llvm;
 
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirly possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+                                const LegalizerInfo::SizeAndActionsVec &v) {
+  for (unsigned i = 0; i < v.size(); ++i) {
+    result.push_back(v[i]);
+    if (i + 1 < v[i].first && i + 1 < v.size() &&
+        v[i + 1].first != v[i].first + 1)
+      result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+  }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_narrow_128_ToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 2);
+  LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+                                             {2, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  assert(Largest + 1 < 128);
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  result.push_back({128, LegalizerInfo::NarrowScalar});
+  result.push_back({129, LegalizerInfo::Unsupported});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_16(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 17);
+  LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::Unsupported},
+                                             {16, LegalizerInfo::WidenScalar},
+                                             {17, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 9);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::WidenScalar},  {2, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 17);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::WidenScalar},  {2, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported},
+      {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16_narrowToLargest(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 17);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::WidenScalar},  {2, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported},
+      {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::NarrowScalar});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16_32(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 33);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::WidenScalar},  {2, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported},
+      {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported},
+      {32, LegalizerInfo::WidenScalar}, {33, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
 AArch64LegalizerInfo::AArch64LegalizerInfo() {
   using namespace TargetOpcode;
   const LLT p0 = LLT::pointer(0, 64);
@@ -42,8 +146,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
   for (auto Ty : {s16, s32, s64, p0})
     setAction({G_PHI, Ty}, Legal);
 
-  for (auto Ty : {s1, s8})
-    setAction({G_PHI, Ty}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1_8);
 
   for (auto Ty : { s32, s64 })
     setAction({G_BSWAP, Ty}, Legal);
@@ -54,15 +157,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
       setAction({BinOp, Ty}, Legal);
 
-    for (auto Ty : {s1, s8, s16})
-      setAction({BinOp, Ty}, WidenScalar);
+    if (BinOp != G_ADD)
+      setLegalizeScalarToDifferentSizeStrategy(BinOp, 0,
+                                               widen_1_8_16_narrowToLargest);
   }
 
   setAction({G_GEP, p0}, Legal);
   setAction({G_GEP, 1, s64}, Legal);
 
-  for (auto Ty : {s1, s8, s16, s32})
-    setAction({G_GEP, 1, Ty}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_GEP, 1, widen_1_8_16_32);
 
   setAction({G_PTR_MASK, p0}, Legal);
 
@@ -70,16 +173,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     for (auto Ty : {s32, s64})
       setAction({BinOp, Ty}, Legal);
 
-    for (auto Ty : {s1, s8, s16})
-      setAction({BinOp, Ty}, WidenScalar);
+    setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1_8_16);
   }
 
   for (unsigned BinOp : {G_SREM, G_UREM})
     for (auto Ty : { s1, s8, s16, s32, s64 })
       setAction({BinOp, Ty}, Lower);
 
-  for (unsigned Op : {G_SMULO, G_UMULO})
-      setAction({Op, s64}, Lower);
+  for (unsigned Op : {G_SMULO, G_UMULO}) {
+    setAction({Op, 0, s64}, Lower);
+    setAction({Op, 1, s1}, Legal);
+  }
 
   for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
     for (auto Ty : { s32, s64 })
@@ -101,8 +205,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     setAction({G_INSERT, Ty}, Legal);
     setAction({G_INSERT, 1, Ty}, Legal);
   }
+  setLegalizeScalarToDifferentSizeStrategy(G_INSERT, 0,
+                                           widen_1_8_16_narrowToLargest);
   for (auto Ty : {s1, s8, s16}) {
-    setAction({G_INSERT, Ty}, WidenScalar);
     setAction({G_INSERT, 1, Ty}, Legal);
     // FIXME: Can't widen the sources because that violates the constraints on
     // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
@@ -118,7 +223,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
       setAction({MemOp, Ty}, Legal);
 
-    setAction({MemOp, s1}, WidenScalar);
+    setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+                                             widen_1_narrow_128_ToLargest);
 
     // And everything's fine in addrspace 0.
     setAction({MemOp, 1, p0}, Legal);
@@ -132,20 +238,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
 
   setAction({G_CONSTANT, p0}, Legal);
 
-  for (auto Ty : {s1, s8, s16})
-    setAction({TargetOpcode::G_CONSTANT, Ty}, WidenScalar);
-
-  setAction({TargetOpcode::G_FCONSTANT, s16}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_FCONSTANT, 0, widen_16);
 
   setAction({G_ICMP, 1, s32}, Legal);
   setAction({G_ICMP, 1, s64}, Legal);
   setAction({G_ICMP, 1, p0}, Legal);
 
-  for (auto Ty : {s1, s8, s16}) {
-    setAction({G_ICMP, Ty}, WidenScalar);
-    setAction({G_FCMP, Ty}, WidenScalar);
-    setAction({G_ICMP, 1, Ty}, WidenScalar);
-  }
+  setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 0, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_FCMP, 0, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1, widen_1_8_16);
 
   setAction({G_ICMP, s32}, Legal);
   setAction({G_FCMP, s32}, Legal);
@@ -159,12 +261,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     setAction({G_ANYEXT, Ty}, Legal);
   }
 
-  for (auto Ty : { s1, s8, s16, s32 }) {
-    setAction({G_ZEXT, 1, Ty}, Legal);
-    setAction({G_SEXT, 1, Ty}, Legal);
-    setAction({G_ANYEXT, 1, Ty}, Legal);
-  }
-
   // FP conversions
   for (auto Ty : { s16, s32 }) {
     setAction({G_FPTRUNC, Ty}, Legal);
@@ -176,12 +272,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     setAction({G_FPEXT, Ty}, Legal);
   }
 
-  for (auto Ty : { s1, s8, s16, s32 })
-    setAction({G_TRUNC, Ty}, Legal);
-
-  for (auto Ty : { s8, s16, s32, s64 })
-    setAction({G_TRUNC, 1, Ty}, Legal);
-
   // Conversions
   for (auto Ty : { s32, s64 }) {
     setAction({G_FPTOSI, 0, Ty}, Legal);
@@ -189,12 +279,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     setAction({G_SITOFP, 1, Ty}, Legal);
     setAction({G_UITOFP, 1, Ty}, Legal);
   }
-  for (auto Ty : { s1, s8, s16 }) {
-    setAction({G_FPTOSI, 0, Ty}, WidenScalar);
-    setAction({G_FPTOUI, 0, Ty}, WidenScalar);
-    setAction({G_SITOFP, 1, Ty}, WidenScalar);
-    setAction({G_UITOFP, 1, Ty}, WidenScalar);
-  }
+  setLegalizeScalarToDifferentSizeStrategy(G_FPTOSI, 0, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_FPTOUI, 0, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_SITOFP, 1, widen_1_8_16);
+  setLegalizeScalarToDifferentSizeStrategy(G_UITOFP, 1, widen_1_8_16);
 
   for (auto Ty : { s32, s64 }) {
     setAction({G_FPTOSI, 1, Ty}, Legal);
@@ -209,8 +297,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
   setAction({G_BRINDIRECT, p0}, Legal);
 
   // Select
-  for (auto Ty : {s1, s8, s16})
-    setAction({G_SELECT, Ty}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_SELECT, 0, widen_1_8_16);
 
   for (auto Ty : {s32, s64, p0})
     setAction({G_SELECT, Ty}, Legal);
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index bd5211d4ff5..bd4bdaa6d12 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -15,7 +15,7 @@
 #include "AArch64MacroFusion.h"
 #include "AArch64Subtarget.h"
 #include "llvm/CodeGen/MacroFusion.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 391e8ed633d..83bf493c9f0 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -87,9 +87,9 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
   assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
                                 {PMI_GPR32, PMI_GPR64}) &&
          "PartialMappingIdx's are incorrectly ordered");
-  assert(checkPartialMappingIdx(
-             PMI_FirstFPR, PMI_LastFPR,
-             {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
+  assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
+                                {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
+                                 PMI_FPR256, PMI_FPR512}) &&
          "PartialMappingIdx's are incorrectly ordered");
 // Now, the content.
 // Check partial mapping.
@@ -102,6 +102,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
 
   CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
   CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+  CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
   CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
   CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
   CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
@@ -121,6 +122,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
 
   CHECK_VALUEMAP(GPR, 32);
   CHECK_VALUEMAP(GPR, 64);
+  CHECK_VALUEMAP(FPR, 16);
   CHECK_VALUEMAP(FPR, 32);
   CHECK_VALUEMAP(FPR, 64);
   CHECK_VALUEMAP(FPR, 128);
@@ -173,6 +175,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
   CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
   CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
 
+#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize)                                 \
+  do {                                                                         \
+    unsigned PartialMapDstIdx = PMI_FPR##DstSize - PMI_Min;                    \
+    unsigned PartialMapSrcIdx = PMI_FPR##SrcSize - PMI_Min;                    \
+    (void)PartialMapDstIdx;                                                    \
+    (void)PartialMapSrcIdx;                                                    \
+    const ValueMapping *Map = getFPExtMapping(DstSize, SrcSize);               \
+    (void)Map;                                                                 \
+    assert(Map[0].BreakDown ==                                                 \
+               &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] &&  \
+           Map[0].NumBreakDowns == 1 && "FPR" #DstSize                         \
+                                        " Dst is incorrectly initialized");    \
+    assert(Map[1].BreakDown ==                                                 \
+               &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] &&  \
+           Map[1].NumBreakDowns == 1 && "FPR" #SrcSize                         \
+                                        " Src is incorrectly initialized");    \
+                                                                               \
+  } while (false)
+
+  CHECK_VALUEMAP_FPEXT(32, 16);
+  CHECK_VALUEMAP_FPEXT(64, 16);
+  CHECK_VALUEMAP_FPEXT(64, 32);
+  CHECK_VALUEMAP_FPEXT(128, 64);
+
   assert(verify(TRI) && "Invalid register bank information");
 }
 
@@ -453,6 +479,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case TargetOpcode::G_FMUL:
   case TargetOpcode::G_FDIV:
     return getSameKindOfOperandsMapping(MI);
+  case TargetOpcode::G_FPEXT: {
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    return getInstructionMapping(
+        DefaultMappingID, /*Cost*/ 1,
+        getFPExtMapping(DstTy.getSizeInBits(), SrcTy.getSizeInBits()),
+        /*NumOperands*/ 2);
+  }
   case TargetOpcode::COPY: {
     unsigned DstReg = MI.getOperand(0).getReg();
     unsigned SrcReg = MI.getOperand(1).getReg();
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
index 6d74a47095a..008221dbef5 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -25,10 +25,10 @@ class TargetRegisterInfo;
 
 class AArch64GenRegisterBankInfo : public RegisterBankInfo {
 protected:
-
   enum PartialMappingIdx {
     PMI_None = -1,
-    PMI_FPR32 = 1,
+    PMI_FPR16 = 1,
+    PMI_FPR32,
     PMI_FPR64,
     PMI_FPR128,
     PMI_FPR256,
@@ -37,7 +37,7 @@ protected:
     PMI_GPR64,
     PMI_FirstGPR = PMI_GPR32,
     PMI_LastGPR = PMI_GPR64,
-    PMI_FirstFPR = PMI_FPR32,
+    PMI_FirstFPR = PMI_FPR16,
     PMI_LastFPR = PMI_FPR512,
     PMI_Min = PMI_FirstFPR,
   };
@@ -49,11 +49,15 @@ protected:
   enum ValueMappingIdx {
     InvalidIdx = 0,
     First3OpsIdx = 1,
-    Last3OpsIdx = 19,
+    Last3OpsIdx = 22,
     DistanceBetweenRegBanks = 3,
-    FirstCrossRegCpyIdx = 22,
-    LastCrossRegCpyIdx = 34,
-    DistanceBetweenCrossRegCpy = 2
+    FirstCrossRegCpyIdx = 25,
+    LastCrossRegCpyIdx = 39,
+    DistanceBetweenCrossRegCpy = 2,
+    FPExt16To32Idx = 41,
+    FPExt16To64Idx = 43,
+    FPExt32To64Idx = 45,
+    FPExt64To128Idx = 47,
   };
 
   static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
@@ -82,6 +86,15 @@ protected:
   static const RegisterBankInfo::ValueMapping *
   getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
 
+  /// Get the instruction mapping for G_FPEXT.
+  ///
+  /// \pre (DstSize, SrcSize) pair is one of the following:
+  ///      (32, 16), (64, 16), (64, 32), (128, 64)
+  ///
+  /// \return An InstructionMapping with statically allocated OperandsMapping.
+  static const RegisterBankInfo::ValueMapping *
+  getFPExtMapping(unsigned DstSize, unsigned SrcSize);
+
 #define GET_TARGET_REGBANK_CLASS
 #include "AArch64GenRegisterBank.inc"
 };
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 91b1481f5ef..1059bc37c8f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -26,7 +26,7 @@
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
 
 using namespace llvm;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index ee5d3547aaa..a9fb0200d80 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -32,6 +32,12 @@ let Namespace = "AArch64" in {
   def qsub : SubRegIndex<64>;
   def sube64 : SubRegIndex<64>;
   def subo64 : SubRegIndex<64>;
+  // SVE
+  def zsub    : SubRegIndex<128>;
+  // Note: zsub_hi should never be used directly because it represents
+  // the scalable part of the SVE vector and cannot be manipulated as a
+  // subvector in the same way the lower 128bits can.
+  def zsub_hi : SubRegIndex<128>;
   // Note: Code depends on these having consecutive numbers
   def dsub0 : SubRegIndex<64>;
   def dsub1 : SubRegIndex<64>;
@@ -460,11 +466,11 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
 // assmebler matching.
 def VectorReg64AsmOperand : AsmOperandClass {
   let Name = "VectorReg64";
-  let PredicateMethod = "isVectorReg";
+  let PredicateMethod = "isNeonVectorReg";
 }
 def VectorReg128AsmOperand : AsmOperandClass {
   let Name = "VectorReg128";
-  let PredicateMethod = "isVectorReg";
+  let PredicateMethod = "isNeonVectorReg";
 }
 
 def V64  : RegisterOperand<FPR64, "printVRegOperand"> {
@@ -475,7 +481,10 @@ def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
   let ParserMatchClass = VectorReg128AsmOperand;
 }
 
-def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def VectorRegLoAsmOperand : AsmOperandClass {
+  let Name = "VectorRegLo";
+  let PredicateMethod = "isNeonVectorRegLo";
+}
 def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
   let ParserMatchClass = VectorRegLoAsmOperand;
 }
@@ -642,3 +651,119 @@ def XSeqPairClassOperand :
 
 
 //===----- END: v8.1a atomic CASP register operands -----------------------===//
+
+// The part of SVE registers that don't overlap Neon registers.
+// These are only used as part of clobber lists.
+def Z0_HI    : AArch64Reg<0,   "z0_hi">;
+def Z1_HI    : AArch64Reg<1,   "z1_hi">;
+def Z2_HI    : AArch64Reg<2,   "z2_hi">;
+def Z3_HI    : AArch64Reg<3,   "z3_hi">;
+def Z4_HI    : AArch64Reg<4,   "z4_hi">;
+def Z5_HI    : AArch64Reg<5,   "z5_hi">;
+def Z6_HI    : AArch64Reg<6,   "z6_hi">;
+def Z7_HI    : AArch64Reg<7,   "z7_hi">;
+def Z8_HI    : AArch64Reg<8,   "z8_hi">;
+def Z9_HI    : AArch64Reg<9,   "z9_hi">;
+def Z10_HI   : AArch64Reg<10, "z10_hi">;
+def Z11_HI   : AArch64Reg<11, "z11_hi">;
+def Z12_HI   : AArch64Reg<12, "z12_hi">;
+def Z13_HI   : AArch64Reg<13, "z13_hi">;
+def Z14_HI   : AArch64Reg<14, "z14_hi">;
+def Z15_HI   : AArch64Reg<15, "z15_hi">;
+def Z16_HI   : AArch64Reg<16, "z16_hi">;
+def Z17_HI   : AArch64Reg<17, "z17_hi">;
+def Z18_HI   : AArch64Reg<18, "z18_hi">;
+def Z19_HI   : AArch64Reg<19, "z19_hi">;
+def Z20_HI   : AArch64Reg<20, "z20_hi">;
+def Z21_HI   : AArch64Reg<21, "z21_hi">;
+def Z22_HI   : AArch64Reg<22, "z22_hi">;
+def Z23_HI   : AArch64Reg<23, "z23_hi">;
+def Z24_HI   : AArch64Reg<24, "z24_hi">;
+def Z25_HI   : AArch64Reg<25, "z25_hi">;
+def Z26_HI   : AArch64Reg<26, "z26_hi">;
+def Z27_HI   : AArch64Reg<27, "z27_hi">;
+def Z28_HI   : AArch64Reg<28, "z28_hi">;
+def Z29_HI   : AArch64Reg<29, "z29_hi">;
+def Z30_HI   : AArch64Reg<30, "z30_hi">;
+def Z31_HI   : AArch64Reg<31, "z31_hi">;
+
+// SVE variable-size vector registers
+let SubRegIndices = [zsub,zsub_hi] in {
+def Z0    : AArch64Reg<0,   "z0",  [Q0,  Z0_HI]>, DwarfRegNum<[96]>;
+def Z1    : AArch64Reg<1,   "z1",  [Q1,  Z1_HI]>, DwarfRegNum<[97]>;
+def Z2    : AArch64Reg<2,   "z2",  [Q2,  Z2_HI]>, DwarfRegNum<[98]>;
+def Z3    : AArch64Reg<3,   "z3",  [Q3,  Z3_HI]>, DwarfRegNum<[99]>;
+def Z4    : AArch64Reg<4,   "z4",  [Q4,  Z4_HI]>, DwarfRegNum<[100]>;
+def Z5    : AArch64Reg<5,   "z5",  [Q5,  Z5_HI]>, DwarfRegNum<[101]>;
+def Z6    : AArch64Reg<6,   "z6",  [Q6,  Z6_HI]>, DwarfRegNum<[102]>;
+def Z7    : AArch64Reg<7,   "z7",  [Q7,  Z7_HI]>, DwarfRegNum<[103]>;
+def Z8    : AArch64Reg<8,   "z8",  [Q8,  Z8_HI]>, DwarfRegNum<[104]>;
+def Z9    : AArch64Reg<9,   "z9",  [Q9,  Z9_HI]>, DwarfRegNum<[105]>;
+def Z10   : AArch64Reg<10, "z10", [Q10, Z10_HI]>, DwarfRegNum<[106]>;
+def Z11   : AArch64Reg<11, "z11", [Q11, Z11_HI]>, DwarfRegNum<[107]>;
+def Z12   : AArch64Reg<12, "z12", [Q12, Z12_HI]>, DwarfRegNum<[108]>;
+def Z13   : AArch64Reg<13, "z13", [Q13, Z13_HI]>, DwarfRegNum<[109]>;
+def Z14   : AArch64Reg<14, "z14", [Q14, Z14_HI]>, DwarfRegNum<[110]>;
+def Z15   : AArch64Reg<15, "z15", [Q15, Z15_HI]>, DwarfRegNum<[111]>;
+def Z16   : AArch64Reg<16, "z16", [Q16, Z16_HI]>, DwarfRegNum<[112]>;
+def Z17   : AArch64Reg<17, "z17", [Q17, Z17_HI]>, DwarfRegNum<[113]>;
+def Z18   : AArch64Reg<18, "z18", [Q18, Z18_HI]>, DwarfRegNum<[114]>;
+def Z19   : AArch64Reg<19, "z19", [Q19, Z19_HI]>, DwarfRegNum<[115]>;
+def Z20   : AArch64Reg<20, "z20", [Q20, Z20_HI]>, DwarfRegNum<[116]>;
+def Z21   : AArch64Reg<21, "z21", [Q21, Z21_HI]>, DwarfRegNum<[117]>;
+def Z22   : AArch64Reg<22, "z22", [Q22, Z22_HI]>, DwarfRegNum<[118]>;
+def Z23   : AArch64Reg<23, "z23", [Q23, Z23_HI]>, DwarfRegNum<[119]>;
+def Z24   : AArch64Reg<24, "z24", [Q24, Z24_HI]>, DwarfRegNum<[120]>;
+def Z25   : AArch64Reg<25, "z25", [Q25, Z25_HI]>, DwarfRegNum<[121]>;
+def Z26   : AArch64Reg<26, "z26", [Q26, Z26_HI]>, DwarfRegNum<[122]>;
+def Z27   : AArch64Reg<27, "z27", [Q27, Z27_HI]>, DwarfRegNum<[123]>;
+def Z28   : AArch64Reg<28, "z28", [Q28, Z28_HI]>, DwarfRegNum<[124]>;
+def Z29   : AArch64Reg<29, "z29", [Q29, Z29_HI]>, DwarfRegNum<[125]>;
+def Z30   : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
+def Z31   : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
+}
+
+class SVERegOp <string Suffix, AsmOperandClass C,
+                RegisterClass RC> : RegisterOperand<RC> {
+  let PrintMethod = !if(!eq(Suffix, ""),
+                        "printSVERegOp<>",
+                        "printSVERegOp<'" # Suffix # "'>");
+  let ParserMatchClass = C;
+}
+
+class ZPRRegOp <string Suffix, AsmOperandClass C,
+                RegisterClass RC> : SVERegOp<Suffix, C, RC> {}
+
+//******************************************************************************
+
+// SVE vector register class
+def ZPR : RegisterClass<"AArch64",
+                        [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+                         nxv2f16, nxv4f16, nxv8f16,
+                         nxv1f32, nxv2f32, nxv4f32,
+                         nxv1f64, nxv2f64],
+                        128, (sequence "Z%u", 0, 31)> {
+  let Size = 128;
+}
+
+class ZPRAsmOperand <string name, int Width>: AsmOperandClass {
+  let Name = "SVE" # name # "Reg";
+  let PredicateMethod = "isSVEDataVectorRegOfWidth<" # Width # ">";
+  let RenderMethod = "addRegOperands";
+  let ParserMethod = "tryParseSVEDataVector<"
+                               # !if(!eq(Width, -1), "false", "true") # ">";
+}
+
+def ZPRAsmOpAny : ZPRAsmOperand<"VectorAny", -1>;
+def ZPRAsmOp8   : ZPRAsmOperand<"VectorB",   8>;
+def ZPRAsmOp16  : ZPRAsmOperand<"VectorH",   16>;
+def ZPRAsmOp32  : ZPRAsmOperand<"VectorS",   32>;
+def ZPRAsmOp64  : ZPRAsmOperand<"VectorD",   64>;
+def ZPRAsmOp128 : ZPRAsmOperand<"VectorQ",   128>;
+
+def ZPRAny  : ZPRRegOp<"",  ZPRAsmOpAny, ZPR>;
+def ZPR8    : ZPRRegOp<"b", ZPRAsmOp8,   ZPR>;
+def ZPR16   : ZPRRegOp<"h", ZPRAsmOp16,  ZPR>;
+def ZPR32   : ZPRRegOp<"s", ZPRAsmOp32,  ZPR>;
+def ZPR64   : ZPRRegOp<"d", ZPRAsmOp64,  ZPR>;
+def ZPR128  : ZPRRegOp<"q", ZPRAsmOp128, ZPR>;
diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
new file mode 100644
index 00000000000..7da0b28d22d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -0,0 +1,17 @@
+//=- AArch64SVEInstrInfo.td -  AArch64 SVE Instructions -*- tablegen -*-----=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSVE] in {
+  defm ADD_ZZZ   : sve_int_bin_cons_arit_0<0b000, "add">;
+  defm SUB_ZZZ   : sve_int_bin_cons_arit_0<0b001, "sub">;
+}
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 18d000ace94..90ebd78f4ab 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -26,6 +26,8 @@ def CortexA53Model : SchedMachineModel {
                              // Specification - Instruction Timings"
                              // v 1.0 Spreadsheet
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 5d1608ef04a..ade03f23f8c 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -31,6 +31,8 @@ def CortexA57Model : SchedMachineModel {
   // experiments and benchmarking data.
   let LoopMicroOpBufferSize = 16;
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index 9fd3ae6818e..7a474ba8ef9 100644
--- a/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -18,6 +18,8 @@ def CycloneModel : SchedMachineModel {
   let LoadLatency = 4; // Optimistic load latency.
   let MispredictPenalty = 16; // 14-19 cycles are typical.
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 44fd94fc3d4..7277198b585 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -23,6 +23,8 @@ def FalkorModel : SchedMachineModel {
   let LoadLatency = 3;         // Optimistic load latency.
   let MispredictPenalty = 11;  // Minimum branch misprediction penalty.
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td
index 4e491a04c78..ce2afd499af 100644
--- a/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/lib/Target/AArch64/AArch64SchedKryo.td
@@ -27,6 +27,8 @@ def KryoModel : SchedMachineModel {
   // experiments and benchmarking data.
   let LoopMicroOpBufferSize = 16;
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 6133efed020..6c86fcdd29b 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -24,6 +24,8 @@ def ExynosM1Model : SchedMachineModel {
   let LoadLatency           =  4; // Optimistic load cases.
   let MispredictPenalty     = 14; // Minimum branch misprediction penalty.
   let CompleteModel         =  1; // Use the default model otherwise.
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
index 3cdd2047fbb..585688aae27 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -25,6 +25,8 @@ def ThunderXT8XModel : SchedMachineModel {
   let MispredictPenalty = 8;  // Branch mispredict penalty.
   let PostRAScheduler = 1;    // Use PostRA scheduler.
   let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 // Modeling each pipeline with BufferSize == 0 since T8X is in-order.
diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 4ab7555594a..fd60459382a 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -25,6 +25,8 @@ def ThunderX2T99Model : SchedMachineModel {
   let LoopMicroOpBufferSize =  32;
   let PostRAScheduler       =   1; // Using PostRA sched.
   let CompleteModel         =   1;
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
 }
 
 // Define the issue ports.
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index fe984ccbaf1..78fc322158b 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -16,10 +16,10 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
index f53af2315ec..2ff644d2bcd 100644
--- a/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ b/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -33,11 +33,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <map>
 
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 1f06d4065b3..de048a24534 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -59,12 +59,14 @@ using namespace llvm;
 
 namespace {
 
+enum class RegKind {Scalar, NeonVector, SVEDataVector};
+
 class AArch64AsmParser : public MCTargetAsmParser {
 private:
   StringRef Mnemonic; ///< Instruction mnemonic.
 
   // Map of register aliases registers via the .req directive.
-  StringMap<std::pair<bool, unsigned>> RegisterReqs;
+  StringMap<std::pair<RegKind, unsigned>> RegisterReqs;
 
   AArch64TargetStreamer &getTargetStreamer() {
     MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
@@ -77,9 +79,10 @@ private:
   void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
   AArch64CC::CondCode parseCondCodeString(StringRef Cond);
   bool parseCondCode(OperandVector &Operands, bool invertCondCode);
-  unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
+  unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
   int tryParseRegister();
   int tryMatchVectorRegister(StringRef &Kind, bool expected);
+  int tryParseSVEDataVectorRegister(const AsmToken &Tok, StringRef &Kind);
   bool parseRegister(OperandVector &Operands);
   bool parseSymbolicImmVal(const MCExpr *&ImmVal);
   bool parseVectorList(OperandVector &Operands);
@@ -126,8 +129,10 @@ private:
   OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
   OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
   OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
-  bool tryParseVectorRegister(OperandVector &Operands);
+  bool tryParseNeonVectorRegister(OperandVector &Operands);
   OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
+  template <bool ParseSuffix>
+  OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
 
 public:
   enum AArch64MatchResultTy {
@@ -194,7 +199,9 @@ private:
 
   struct RegOp {
     unsigned RegNum;
-    bool isVector;
+    RegKind Kind;
+
+    int ElementWidth;
   };
 
   struct VectorListOp {
@@ -804,34 +811,50 @@ public:
     return SysReg.PStateField != -1U;
   }
 
-  bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
-  bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+  bool isReg() const override {
+    return Kind == k_Register && Reg.Kind == RegKind::Scalar;
+  }
+
+  bool isNeonVectorReg() const {
+    return Kind == k_Register && Reg.Kind == RegKind::NeonVector;
+  }
 
-  bool isVectorRegLo() const {
-    return Kind == k_Register && Reg.isVector &&
+  bool isNeonVectorRegLo() const {
+    return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
            AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
                Reg.RegNum);
   }
 
+  template <unsigned Class = AArch64::ZPRRegClassID>
+  bool isSVEDataVectorReg() const {
+    return (Kind == k_Register && Reg.Kind == RegKind::SVEDataVector) &&
+           AArch64MCRegisterClasses[Class].contains(getReg());
+  }
+
+  template <int ElementWidth> bool isSVEDataVectorRegOfWidth() const {
+    return isSVEDataVectorReg() &&
+           (ElementWidth == -1 || Reg.ElementWidth == ElementWidth);
+  }
+
   bool isGPR32as64() const {
-    return Kind == k_Register && !Reg.isVector &&
+    return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
       AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
   }
 
   bool isWSeqPair() const {
-    return Kind == k_Register && !Reg.isVector &&
+    return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
            AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
                Reg.RegNum);
   }
 
   bool isXSeqPair() const {
-    return Kind == k_Register && !Reg.isVector &&
+    return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
            AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains(
                Reg.RegNum);
   }
 
   bool isGPR64sp0() const {
-    return Kind == k_Register && !Reg.isVector &&
+    return Kind == k_Register && Reg.Kind == RegKind::Scalar &&
       AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
   }
 
@@ -1564,10 +1587,22 @@ public:
   }
 
   static std::unique_ptr<AArch64Operand>
-  CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+  CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx) {
     auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
     Op->Reg.RegNum = RegNum;
-    Op->Reg.isVector = isVector;
+    Op->Reg.Kind = Kind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
+            SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.ElementWidth = ElementWidth;
+    Op->Reg.Kind = Kind;
     Op->StartLoc = S;
     Op->EndLoc = E;
     return Op;
@@ -1791,7 +1826,7 @@ static unsigned MatchRegisterName(StringRef Name);
 
 /// }
 
-static unsigned matchVectorRegName(StringRef Name) {
+static unsigned MatchNeonVectorRegName(StringRef Name) {
   return StringSwitch<unsigned>(Name.lower())
       .Case("v0", AArch64::Q0)
       .Case("v1", AArch64::Q1)
@@ -1853,6 +1888,57 @@ static bool isValidVectorKind(StringRef Name) {
       .Default(false);
 }
 
+static unsigned matchSVEDataVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name.lower())
+      .Case("z0", AArch64::Z0)
+      .Case("z1", AArch64::Z1)
+      .Case("z2", AArch64::Z2)
+      .Case("z3", AArch64::Z3)
+      .Case("z4", AArch64::Z4)
+      .Case("z5", AArch64::Z5)
+      .Case("z6", AArch64::Z6)
+      .Case("z7", AArch64::Z7)
+      .Case("z8", AArch64::Z8)
+      .Case("z9", AArch64::Z9)
+      .Case("z10", AArch64::Z10)
+      .Case("z11", AArch64::Z11)
+      .Case("z12", AArch64::Z12)
+      .Case("z13", AArch64::Z13)
+      .Case("z14", AArch64::Z14)
+      .Case("z15", AArch64::Z15)
+      .Case("z16", AArch64::Z16)
+      .Case("z17", AArch64::Z17)
+      .Case("z18", AArch64::Z18)
+      .Case("z19", AArch64::Z19)
+      .Case("z20", AArch64::Z20)
+      .Case("z21", AArch64::Z21)
+      .Case("z22", AArch64::Z22)
+      .Case("z23", AArch64::Z23)
+      .Case("z24", AArch64::Z24)
+      .Case("z25", AArch64::Z25)
+      .Case("z26", AArch64::Z26)
+      .Case("z27", AArch64::Z27)
+      .Case("z28", AArch64::Z28)
+      .Case("z29", AArch64::Z29)
+      .Case("z30", AArch64::Z30)
+      .Case("z31", AArch64::Z31)
+      .Default(0);
+}
+
+static bool isValidSVEKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".b", true)
+      .Case(".h", true)
+      .Case(".s", true)
+      .Case(".d", true)
+      .Case(".q", true)
+      .Default(false);
+}
+
+static bool isSVEDataVectorRegister(StringRef Name) {
+  return Name[0] == 'z';
+}
+
 static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
                                  char &ElementKind) {
   assert(isValidVectorKind(Name));
@@ -1881,19 +1967,30 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
 
 // Matches a register name or register alias previously defined by '.req'
 unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
-                                                  bool isVector) {
-  unsigned RegNum = isVector ? matchVectorRegName(Name)
-                             : MatchRegisterName(Name);
+                                                  RegKind Kind) {
+  unsigned RegNum;
+  switch (Kind) {
+  case RegKind::Scalar:
+    RegNum = MatchRegisterName(Name);
+    break;
+  case RegKind::NeonVector:
+    RegNum = MatchNeonVectorRegName(Name);
+    break;
+  case RegKind::SVEDataVector:
+    RegNum = matchSVEDataVectorRegName(Name);
+    break;
+  }
 
-  if (RegNum == 0) {
+  if (!RegNum) {
     // Check for aliases registered via .req. Canonicalize to lower case.
     // That's more consistent since register names are case insensitive, and
     // it's how the original entry was passed in from MC/MCParser/AsmParser.
     auto Entry = RegisterReqs.find(Name.lower());
     if (Entry == RegisterReqs.end())
       return 0;
+
     // set RegNum if the match is the right kind of register
-    if (isVector == Entry->getValue().first)
+    if (Kind == Entry->getValue().first)
       RegNum = Entry->getValue().second;
   }
   return RegNum;
@@ -1909,7 +2006,10 @@ int AArch64AsmParser::tryParseRegister() {
     return -1;
 
   std::string lowerCase = Tok.getString().lower();
-  unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
+  if (isSVEDataVectorRegister(lowerCase))
+    return -1;
+
+  unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar);
   // Also handle a few aliases of registers.
   if (RegNum == 0)
     RegNum = StringSwitch<unsigned>(lowerCase)
@@ -1940,7 +2040,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
   // a '.'.
   size_t Start = 0, Next = Name.find('.');
   StringRef Head = Name.slice(Start, Next);
-  unsigned RegNum = matchRegisterNameAlias(Head, true);
+  unsigned RegNum = matchRegisterNameAlias(Head, RegKind::NeonVector);
 
   if (RegNum) {
     if (Next != StringRef::npos) {
@@ -2559,8 +2659,8 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
-/// tryParseVectorRegister - Parse a vector register operand.
-bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+/// tryParseNeonVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
   MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::Identifier))
     return true;
@@ -2572,7 +2672,9 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
   if (Reg == -1)
     return true;
   Operands.push_back(
-      AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+      AArch64Operand::CreateReg(Reg, RegKind::NeonVector, S, getLoc(),
+                                getContext()));
+
   // If there was an explicit qualifier, that goes on as a literal text
   // operand.
   if (!Kind.empty())
@@ -2603,19 +2705,48 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
   return false;
 }
 
+// tryParseSVEDataVectorRegister - Try to parse a SVE vector register name with
+// optional kind specifier. If it is a register specifier, eat the token
+// and return it.
+int AArch64AsmParser::tryParseSVEDataVectorRegister(const AsmToken &Tok,
+                                                StringRef &Kind) {
+  if (Tok.isNot(AsmToken::Identifier))
+    return -1;
+
+  StringRef Name = Tok.getString();
+  // If there is a kind specifier, it's separated from the register name by
+  // a '.'.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+  unsigned RegNum = matchRegisterNameAlias(Head, RegKind::SVEDataVector);
+
+  if (RegNum) {
+    if (Next != StringRef::npos) {
+      Kind = Name.slice(Next, StringRef::npos);
+      if (!isValidSVEKind(Kind)) {
+        TokError("invalid sve vector kind qualifier");
+        return -1;
+      }
+    }
+    return RegNum;
+  }
+
+  return -1;
+}
+
 /// parseRegister - Parse a non-vector register operand.
 bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
   SMLoc S = getLoc();
-  // Try for a vector register.
-  if (!tryParseVectorRegister(Operands))
+  // Try for a vector (neon) register.
+  if (!tryParseNeonVectorRegister(Operands))
     return false;
 
   // Try for a scalar register.
   int64_t Reg = tryParseRegister();
   if (Reg == -1)
     return true;
-  Operands.push_back(
-      AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+  Operands.push_back(AArch64Operand::CreateReg(Reg, RegKind::Scalar, S,
+      getLoc(), getContext()));
 
   return false;
 }
@@ -2783,7 +2914,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
   if (!Tok.is(AsmToken::Identifier))
     return MatchOperand_NoMatch;
 
-  unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
+  unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), RegKind::Scalar);
 
   MCContext &Ctx = getContext();
   const MCRegisterInfo *RI = Ctx.getRegisterInfo();
@@ -2795,7 +2926,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
 
   if (!parseOptionalToken(AsmToken::Comma)) {
     Operands.push_back(
-        AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+        AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
     return MatchOperand_Success;
   }
 
@@ -2814,7 +2945,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
   }
 
   Operands.push_back(
-      AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+      AArch64Operand::CreateReg(RegNum, RegKind::Scalar, S, getLoc(), Ctx));
   return MatchOperand_Success;
 }
 
@@ -3529,8 +3660,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         Operands[0] = AArch64Operand::CreateToken(
               "bfm", false, Op.getStartLoc(), getContext());
         Operands[2] = AArch64Operand::CreateReg(
-            RegWidth == 32 ? AArch64::WZR : AArch64::XZR, false, SMLoc(),
-            SMLoc(), getContext());
+            RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar,
+            SMLoc(), SMLoc(), getContext());
         Operands[3] = AArch64Operand::CreateImm(
             ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
         Operands.emplace_back(
@@ -3666,8 +3797,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
     if (Op.isReg()) {
       unsigned Reg = getXRegFromWReg(Op.getReg());
-      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
-                                              Op.getEndLoc(), getContext());
+      Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+                                              Op.getStartLoc(), Op.getEndLoc(),
+                                              getContext());
     }
   }
   // FIXME: Likewise for sxt[bh] with a Xd dst operand
@@ -3681,7 +3813,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
       if (Op.isReg()) {
         unsigned Reg = getXRegFromWReg(Op.getReg());
-        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+        Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+                                                Op.getStartLoc(),
                                                 Op.getEndLoc(), getContext());
       }
     }
@@ -3697,7 +3830,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
       if (Op.isReg()) {
         unsigned Reg = getWRegFromXReg(Op.getReg());
-        Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+        Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
+                                                Op.getStartLoc(),
                                                 Op.getEndLoc(), getContext());
       }
     }
@@ -4158,14 +4292,25 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
   Parser.Lex(); // Eat the '.req' token.
   SMLoc SRegLoc = getLoc();
   unsigned RegNum = tryParseRegister();
-  bool IsVector = false;
+  RegKind RegisterKind = RegKind::Scalar;
 
   if (RegNum == static_cast<unsigned>(-1)) {
     StringRef Kind;
+    RegisterKind = RegKind::NeonVector;
     RegNum = tryMatchVectorRegister(Kind, false);
     if (!Kind.empty())
       return Error(SRegLoc, "vector register without type specifier expected");
-    IsVector = true;
+  }
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    StringRef Kind;
+    RegisterKind = RegKind::SVEDataVector;
+    int RegNumTmp = tryParseSVEDataVectorRegister(Parser.getTok(), Kind);
+    if (RegNumTmp != -1)
+      Parser.Lex();
+    RegNum = RegNumTmp;
+    if (!Kind.empty())
+      return Error(SRegLoc, "sve vector register without type specifier expected");
   }
 
   if (RegNum == static_cast<unsigned>(-1))
@@ -4176,7 +4321,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
                  "unexpected input in .req directive"))
     return true;
 
-  auto pair = std::make_pair(IsVector, RegNum);
+  auto pair = std::make_pair(RegisterKind, RegNum);
   if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair)
     Warning(L, "ignoring redefinition of register alias '" + Name + "'");
 
@@ -4388,8 +4533,43 @@ AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
            &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
   }
 
-  Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(),
-      getContext()));
+  Operands.push_back(AArch64Operand::CreateReg(Pair, RegKind::Scalar, S,
+      getLoc(), getContext()));
+
+  return MatchOperand_Success;
+}
+
+template <bool ParseSuffix>
+OperandMatchResultTy
+AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const SMLoc S = getLoc();
+  // Check for a SVE vector register specifier first.
+  StringRef Kind;
+  int RegNum = tryParseSVEDataVectorRegister(Parser.getTok(), Kind);
+  if (RegNum == -1)
+    return MatchOperand_NoMatch;
+
+  // Eat the SVE Register Token
+  Parser.Lex();
+
+  if (ParseSuffix && Kind.empty())
+    return MatchOperand_NoMatch;
+
+  unsigned ElementWidth = StringSwitch<unsigned>(Kind.lower())
+                        .Case("", -1)
+                        .Case(".b", 8)
+                        .Case(".h", 16)
+                        .Case(".s", 32)
+                        .Case(".d", 64)
+                        .Case(".q", 128)
+                        .Default(0);
+  if (!ElementWidth)
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(
+    AArch64Operand::CreateReg(RegNum, RegKind::SVEDataVector, ElementWidth,
+                              S, S, getContext()));
 
   return MatchOperand_Success;
 }
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 73db32c0487..aea1b4f2d2c 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -85,6 +85,9 @@ static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const void *Decoder);
+static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decode);
 
 static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm,
                                                uint64_t Address,
@@ -436,6 +439,27 @@ static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
   Inst.addOperand(MCOperand::createReg(Register));
   return Success;
 }
+static const unsigned ZPRDecoderTable[] = {
+    AArch64::Z0,  AArch64::Z1,  AArch64::Z2,  AArch64::Z3,
+    AArch64::Z4,  AArch64::Z5,  AArch64::Z6,  AArch64::Z7,
+    AArch64::Z8,  AArch64::Z9,  AArch64::Z10, AArch64::Z11,
+    AArch64::Z12, AArch64::Z13, AArch64::Z14, AArch64::Z15,
+    AArch64::Z16, AArch64::Z17, AArch64::Z18, AArch64::Z19,
+    AArch64::Z20, AArch64::Z21, AArch64::Z22, AArch64::Z23,
+    AArch64::Z24, AArch64::Z25, AArch64::Z26, AArch64::Z27,
+    AArch64::Z28, AArch64::Z29, AArch64::Z30, AArch64::Z31
+};
+
+static DecodeStatus DecodeZPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void* Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = ZPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
 
 static const unsigned VectorDecoderTable[] = {
     AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 62e5d02f603..bdf71b095fd 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -1340,3 +1340,23 @@ void AArch64InstPrinter::printComplexRotationOp(const MCInst *MI, unsigned OpNo,
   O << "#" << (Val * Angle) + Remainder;
 }
 
+template <char suffix>
+void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  switch (suffix) {
+  case 0:
+  case 'b':
+  case 'h':
+  case 's':
+  case 'd':
+  case 'q':
+    break;
+  default: llvm_unreachable("Invalid kind specifier.");
+  }
+
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  O << getRegisterName(Reg);
+  if (suffix != 0)
+    O << '.' << suffix;
+}
+\ No newline at end of file
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 8515ad24c71..76f20f042ce 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -165,6 +165,9 @@ protected:
   void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
                                     const MCSubtargetInfo &STI,
                                     raw_ostream &O);
+  template <char = 0>
+  void printSVERegOp(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
 };
 
 class AArch64AppleInstPrinter : public AArch64InstPrinter {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 7fba4849438..c5da457c38f 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -106,13 +106,15 @@ AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
   PrivateLabelPrefix = ".L";
   AlignmentIsInBytes = false;
   SupportsDebugInformation = true;
-  ExceptionsType = ExceptionHandling::WinEH;
+  CodePointerSize = 8;
 }
 
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   CommentString = ";";
+  ExceptionsType = ExceptionHandling::WinEH;
 }
 
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   CommentString = "//";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 9d0f39e5f6a..c88363d2c25 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -23,7 +23,15 @@ public:
                          std::unique_ptr<MCCodeEmitter> CE,
                          raw_pwrite_stream &OS)
       : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
+
+  void FinishImpl() override;
 };
+
+void AArch64WinCOFFStreamer::FinishImpl() {
+  EmitFrames(nullptr);
+
+  MCWinCOFFStreamer::FinishImpl();
+}
 } // end anonymous namespace
 
 namespace llvm {
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
new file mode 100644
index 00000000000..e74bab8b7fe
--- /dev/null
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -0,0 +1,41 @@
+//=-- SVEInstrFormats.td -  AArch64 SVE Instruction classes -*- tablegen -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Arithmetic - Unpredicated Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
+                              ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+  asm, "\t$Zd, $Zn, $Zm",
+  "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zm;
+  bits<5> Zn;
+  let Inst{31-24} = 0b00000100;
+  let Inst{23-22} = sz8_64;
+  let Inst{21}    = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{15-13} = 0b000;
+  let Inst{12-10} = opc;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
+  def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
+  def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
+  def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
+  def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 156f7bc6512..b17b6716766 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -400,7 +400,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     return false;
 
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                       FMF.allowReciprocal();
 
   // With UnsafeDiv node will be optimized to just rcp and mul.
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 2329fffd521..91fe921bfee 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -15,7 +15,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c313e4a04ef..f04efd71fa0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -204,6 +204,7 @@ private:
   void SelectADD_SUB_I64(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
+  void SelectMAD_64_32(SDNode *N);
   void SelectFMA_W_CHAIN(SDNode *N);
   void SelectFMUL_W_CHAIN(SDNode *N);
 
@@ -594,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectDIV_SCALE(N);
     return;
   }
+  case AMDGPUISD::MAD_I64_I32:
+  case AMDGPUISD::MAD_U64_U32: {
+    SelectMAD_64_32(N);
+    return;
+  }
   case ISD::CopyToReg: {
     const SITargetLowering& Lowering =
       *static_cast<const SITargetLowering*>(getTargetLowering());
@@ -814,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
+  SDLoc SL(N);
+  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
+  unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;
+
+  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                    Clamp };
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fe2c9337721..d502b77447d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -128,27 +128,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
-bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
-{
-  assert(Op.getOpcode() == ISD::OR);
-
-  SDValue N0 = Op->getOperand(0);
-  SDValue N1 = Op->getOperand(1);
-  EVT VT = N0.getValueType();
-
-  if (VT.isInteger() && !VT.isVector()) {
-    KnownBits LHSKnown, RHSKnown;
-    DAG.computeKnownBits(N0, LHSKnown);
+unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
+  KnownBits Known;
+  EVT VT = Op.getValueType();
+  DAG.computeKnownBits(Op, Known);
 
-    if (LHSKnown.Zero.getBoolValue()) {
-      DAG.computeKnownBits(N1, RHSKnown);
+  return VT.getSizeInBits() - Known.countMinLeadingZeros();
+}
 
-      if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
-        return true;
-    }
-  }
+unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
 
-  return false;
+  // In order for this to be a signed 24-bit value, bit 23, must
+  // be a sign bit.
+  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
 }
 
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
@@ -2615,21 +2608,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
 //===----------------------------------------------------------------------===//
 
 static bool isU24(SDValue Op, SelectionDAG &DAG) {
-  KnownBits Known;
-  EVT VT = Op.getValueType();
-  DAG.computeKnownBits(Op, Known);
-
-  return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
+  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
 }
 
 static bool isI24(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
-
-  // In order for this to be a signed 24-bit value, bit 23, must
-  // be a sign bit.
   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                      // as unsigned 24-bit values.
-         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+    AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
 }
 
 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
@@ -2914,21 +2900,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
     return DAG.getZExtOrTrunc(Shl, SL, VT);
   }
-  case ISD::OR:
-    if (!isOrEquivalentToAdd(DAG, LHS))
-      break;
-    LLVM_FALLTHROUGH;
-  case ISD::ADD: {
-    // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
-    if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
-      SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
-                                SDValue(RHS, 0));
-      SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
-                                    SDLoc(C2), VT);
-      return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
-    }
-    break;
-  }
   }
 
   if (VT != MVT::i64)
@@ -3946,6 +3917,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MUL_LOHI_I24)
   NODE_NAME_CASE(MAD_U24)
   NODE_NAME_CASE(MAD_I24)
+  NODE_NAME_CASE(MAD_I64_I32)
+  NODE_NAME_CASE(MAD_U64_U32)
   NODE_NAME_CASE(TEXTURE_FETCH)
   NODE_NAME_CASE(EXPORT)
   NODE_NAME_CASE(EXPORT_DONE)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index cdb15186f86..ba35aeb90ed 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -35,7 +35,8 @@ private:
   SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;
 
 public:
-  static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+  static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
+  static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
 
 protected:
   const AMDGPUSubtarget *Subtarget;
@@ -379,6 +380,8 @@ enum NodeType : unsigned {
   MULHI_I24,
   MAD_U24,
   MAD_I24,
+  MAD_U64_U32,
+  MAD_I64_I32,
   MUL_LOHI_I24,
   MUL_LOHI_U24,
   TEXTURE_FETCH,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 41cc7d7093e..f1a42b42f1f 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -18,7 +18,7 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "AMDGPUGenInstrInfo.inc"
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index e7e54750fe6..714c60a7446 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -487,7 +487,7 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
 
 bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
   if (auto Op = dyn_cast<FPMathOperator>(CI))
-    if (Op->hasUnsafeAlgebra())
+    if (Op->isFast())
       return true;
   const Function *F = CI->getParent()->getParent();
   Attribute Attr = F->getFnAttribute("unsafe-fp-math");
@@ -1337,7 +1337,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // for OpenCL 2.0 we have only generic implementation of sincos
   // function.
   AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  nf.getLeads()[0].PtrKind = AMDGPULibFunc::GENERIC;
+  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
+  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS);
   Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
   if (!Fsincos) return false;
 
@@ -1350,7 +1351,6 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
   // The allocaInst allocates the memory in private address space. This need
   // to be bitcasted to point to the address space of cos pointer type.
   // In OpenCL 2.0 this is generic, while in 1.2 that is private.
-  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
   if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
     P = B.CreateAddrSpaceCast(Alloc, PTy);
   CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 919e8c1e13c..4671273d61f 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPULibFunc.h"
 #include <llvm/ADT/SmallString.h>
 #include <llvm/ADT/SmallVector.h>
@@ -458,13 +459,16 @@ AMDGPULibFunc::Param ParamIterator::getNextParam() {
       P.ArgType = AMDGPULibFunc::I32;
       break;
 
-    case E_CONSTPTR_SWAPGL:
-      switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
-      case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
-      case AMDGPULibFunc::LOCAL:  P.PtrKind = AMDGPULibFunc::GLOBAL; break;
+    case E_CONSTPTR_SWAPGL: {
+      unsigned AS = AMDGPULibFunc::getAddrSpaceFromEPtrKind(P.PtrKind);
+      switch (AS) {
+      case AMDGPUAS::GLOBAL_ADDRESS: AS = AMDGPUAS::LOCAL_ADDRESS; break;
+      case AMDGPUAS::LOCAL_ADDRESS:  AS = AMDGPUAS::GLOBAL_ADDRESS; break;
       }
+      P.PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS);
       P.PtrKind |= AMDGPULibFunc::CONST;
       break;
+    }
 
     default: llvm_unreachable("Unhandeled param rule");
     }
@@ -590,19 +594,14 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param,
   if (eatTerm(param, 'P')) {
     if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
     if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
+    unsigned AS;
     if (!eatTerm(param, "U3AS")) {
-      res.PtrKind |= AMDGPULibFunc::PRIVATE;
+      AS = 0;
     } else {
-      switch(param.front()) {
-      case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL;  break;
-      case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
-      case '3': res.PtrKind |= AMDGPULibFunc::LOCAL;   break;
-      case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
-      case '5': res.PtrKind |= AMDGPULibFunc::OTHER;   break;
-      default: return false;
-      }
+      AS = param.front() - '0';
       drop_front(param, 1);
     }
+    res.PtrKind |= AMDGPULibFuncBase::getEPtrKindFromAddrSpace(AS);
   } else {
     res.PtrKind = AMDGPULibFunc::BYVALUE;
   }
@@ -837,7 +836,9 @@ public:
       os << 'P';
       if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
       if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
-      int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
+      unsigned AS = UseAddrSpace
+                        ? AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind)
+                        : 0;
       if (AS != 0) os << "U3AS" << AS;
       Ptr = p;
       p.PtrKind = 0;
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h
index 8f4297b4a49..5405bc64571 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -283,14 +283,7 @@ public:
 
   enum EPtrKind {
     BYVALUE = 0,
-    PRIVATE,
-    GLOBAL,
-    READONLY,
-    LOCAL,
-    GENERIC,
-    OTHER,
-
-    ADDR_SPACE = 0xF,
+    ADDR_SPACE = 0xF, // Address space takes value 0x1 ~ 0xF.
     CONST      = 0x10,
     VOLATILE   = 0x20
   };
@@ -315,6 +308,17 @@ public:
   static bool isMangled(EFuncId Id) {
     return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED);
   }
+
+  static unsigned getEPtrKindFromAddrSpace(unsigned AS) {
+    assert(((AS + 1) & ~ADDR_SPACE) == 0);
+    return AS + 1;
+  }
+
+  static unsigned getAddrSpaceFromEPtrKind(unsigned Kind) {
+    Kind = Kind & ADDR_SPACE;
+    assert(Kind >= 1);
+    return Kind - 1;
+  }
 };
 
 class AMDGPULibFuncImpl : public AMDGPULibFuncBase {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9fc9592bdc5..83122281d2b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -23,7 +23,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/MDBuilder.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include <algorithm>
 
 using namespace llvm;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 56a5fa634b5..6ee529c8549 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -462,6 +462,10 @@ public:
     return isAmdHsaOS() || isMesaKernel(MF);
   }
 
+  bool hasMad64_32() const {
+    return getGeneration() >= SEA_ISLANDS;
+  }
+
   bool hasFminFmaxLegacy() const {
     return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7ecdea7704..14f26f787ab 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -245,6 +245,10 @@ GCNMinRegSchedRegistry("gcn-minreg",
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
+    if (TT.getEnvironmentName() == "amdgiz" ||
+        TT.getEnvironmentName() == "amdgizcl")
+      return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+             "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
             "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
   }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index d729dcc439e..d1120f5e330 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2134,12 +2134,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    // FIXME: Remove this hack for function pointer types.
-    const GlobalValue *GV = GA->getGlobal();
-    assert(Callee.getValueType() == MVT::i32);
-    Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
-                                  false, GA->getTargetFlags());
+    // FIXME: Remove this hack for function pointer types after removing
+    // support of old address space mapping. In the new address space
+    // mapping the pointer in default address space is 64 bit, therefore
+    // does not need this hack.
+    if (Callee.getValueType() == MVT::i32) {
+      const GlobalValue *GV = GA->getGlobal();
+      Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
+                                    GA->getTargetFlags());
+    }
   }
+  assert(Callee.getValueType() == MVT::i64);
 
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -5957,18 +5962,57 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
   return 0;
 }
 
+static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
+                           EVT VT,
+                           SDValue N0, SDValue N1, SDValue N2,
+                           bool Signed) {
+  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
+  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
+  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
+  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
+}
+
 SDValue SITargetLowering::performAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
-
-  if (VT != MVT::i32)
-    return SDValue();
-
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
+      && Subtarget->hasMad64_32() &&
+      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
+      VT.getScalarSizeInBits() <= 64) {
+    if (LHS.getOpcode() != ISD::MUL)
+      std::swap(LHS, RHS);
+
+    SDValue MulLHS = LHS.getOperand(0);
+    SDValue MulRHS = LHS.getOperand(1);
+    SDValue AddRHS = RHS;
+
+    // TODO: Maybe restrict if SGPR inputs.
+    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
+        numBitsUnsigned(MulRHS, DAG) <= 32) {
+      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
+      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
+      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
+      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
+    }
+
+    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
+      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
+      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
+      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
+    }
+
+    return SDValue();
+  }
+
+  if (VT != MVT::i32)
+    return SDValue();
+
   // add x, zext (setcc) => addcarry x, 0, setcc
   // add x, sext (setcc) => subcarry x, 0, setcc
   unsigned Opc = LHS.getOpcode();
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ade909cc84e..5dde72910ee 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -14,15 +14,15 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
 
-#include "AMDGPUMachineFunction.h"
 #include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUMachineFunction.h"
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <array>
@@ -87,9 +87,6 @@ public:
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
-  // FIXME: This should be removed and getPreloadedValue moved here.
-  friend class SIRegisterInfo;
-
   unsigned TIDReg = AMDGPU::NoRegister;
 
   // Registers that may be reserved for spilling purposes. These may be the same
@@ -143,7 +140,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 
 private:
   unsigned LDSWaveSpillSize = 0;
-  unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
 
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index aa041aab51c..666b80107dc 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -399,8 +399,10 @@ def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_
 } // End Constraints = "@earlyclobber $vdst"
 
 let isCommutable = 1 in {
+let SchedRW = [WriteDouble, WriteSALU] in {
 def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
 def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
+} // End SchedRW = [WriteDouble, WriteSALU]
 } // End isCommutable = 1
 
 } // End SubtargetPredicate = isCIVI
diff --git a/lib/Target/ARC/ARCBranchFinalize.cpp b/lib/Target/ARC/ARCBranchFinalize.cpp
index 0fb8a420d86..e5b0f8f3208 100644
--- a/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -20,8 +20,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <vector>
 
 using namespace llvm;
diff --git a/lib/Target/ARC/ARCFrameLowering.h b/lib/Target/ARC/ARCFrameLowering.h
index ac5378adbd8..c042bec016c 100644
--- a/lib/Target/ARC/ARCFrameLowering.h
+++ b/lib/Target/ARC/ARCFrameLowering.h
@@ -17,7 +17,7 @@
 #include "ARC.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/ARC/ARCInstrInfo.h b/lib/Target/ARC/ARCInstrInfo.h
index 5285dce9f12..f965dd4ff7f 100644
--- a/lib/Target/ARC/ARCInstrInfo.h
+++ b/lib/Target/ARC/ARCInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_ARC_ARCINSTRINFO_H
 
 #include "ARCRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "ARCGenInstrInfo.inc"
diff --git a/lib/Target/ARC/ARCRegisterInfo.cpp b/lib/Target/ARC/ARCRegisterInfo.cpp
index 66f95911d3e..bed47a0eab5 100644
--- a/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -25,7 +25,7 @@
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3688db943d5..dac11626a6f 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -53,7 +54,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 9f168acd567..99bcf788ddf 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -21,7 +21,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include <array>
 #include <cstdint>
 
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 63b14ee98d7..eaa8d4c0f1a 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -31,6 +31,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -41,7 +42,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index e1323cd9427..9c10a1c79a4 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -417,6 +417,12 @@ struct FormalArgHandler : public IncomingValueHandler {
 bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                            const Function &F,
                                            ArrayRef<unsigned> VRegs) const {
+  auto &TLI = *getTLI<ARMTargetLowering>();
+  auto Subtarget = TLI.getSubtarget();
+
+  if (Subtarget->isThumb())
+    return false;
+
   // Quick exit if there aren't any args
   if (F.arg_empty())
     return true;
@@ -427,12 +433,6 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   auto &MF = MIRBuilder.getMF();
   auto &MBB = MIRBuilder.getMBB();
   auto DL = MF.getDataLayout();
-  auto &TLI = *getTLI<ARMTargetLowering>();
-
-  auto Subtarget = TLI.getSubtarget();
-
-  if (Subtarget->isThumb())
-    return false;
 
   for (auto &Arg : F.args())
     if (!isSupportedType(DL, TLI, Arg.getType()))
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 71b81936240..284b67fd59b 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -19,8 +19,8 @@
 #include "ARMBaseInstrInfo.h"
 #include "ARMSubtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 namespace llvm {
 
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5c967d9f906..f42d00ecf60 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -43,6 +43,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Attributes.h"
@@ -72,7 +73,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index ce4add974d6..ab8fbb47086 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
@@ -49,7 +50,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 2c10031e3f8..1f18e2bf80c 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -11,7 +11,7 @@
 #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include <vector>
 
 namespace llvm {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 9cfa0d3a7c3..deece84ecf2 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -57,6 +57,7 @@
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
@@ -94,7 +95,6 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 309430b0e9c..34186dede0d 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -24,6 +24,54 @@
 
 using namespace llvm;
 
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirly possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+                                const LegalizerInfo::SizeAndActionsVec &v) {
+  for (unsigned i = 0; i < v.size(); ++i) {
+    result.push_back(v[i]);
+    if (i + 1 < v[i].first && i + 1 < v.size() &&
+        v[i + 1].first != v[i].first + 1)
+      result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+  }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 17);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported},
+      {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1_8_16(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 17);
+  LegalizerInfo::SizeAndActionsVec result = {
+      {1, LegalizerInfo::WidenScalar},  {2, LegalizerInfo::Unsupported},
+      {8, LegalizerInfo::WidenScalar},  {9, LegalizerInfo::Unsupported},
+      {16, LegalizerInfo::WidenScalar}, {17, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
 static bool AEABI(const ARMSubtarget &ST) {
   return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
 }
@@ -49,14 +97,15 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   }
 
   for (unsigned Op : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
-    for (auto Ty : {s1, s8, s16})
-      setAction({Op, Ty}, WidenScalar);
+    if (Op != G_ADD)
+      setLegalizeScalarToDifferentSizeStrategy(
+          Op, 0, widenToLargerTypesUnsupportedOtherwise);
     setAction({Op, s32}, Legal);
   }
 
   for (unsigned Op : {G_SDIV, G_UDIV}) {
-    for (auto Ty : {s8, s16})
-      setAction({Op, Ty}, WidenScalar);
+    setLegalizeScalarToDifferentSizeStrategy(Op, 0,
+        widenToLargerTypesUnsupportedOtherwise);
     if (ST.hasDivideInARMMode())
       setAction({Op, s32}, Legal);
     else
@@ -64,8 +113,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   }
 
   for (unsigned Op : {G_SREM, G_UREM}) {
-    for (auto Ty : {s8, s16})
-      setAction({Op, Ty}, WidenScalar);
+    setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16);
     if (ST.hasDivideInARMMode())
       setAction({Op, s32}, Lower);
     else if (AEABI(ST))
@@ -74,10 +122,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
       setAction({Op, s32}, Libcall);
   }
 
-  for (unsigned Op : {G_SEXT, G_ZEXT}) {
+  for (unsigned Op : {G_SEXT, G_ZEXT, G_ANYEXT}) {
     setAction({Op, s32}, Legal);
-    for (auto Ty : {s1, s8, s16})
-      setAction({Op, 1, Ty}, Legal);
   }
 
   for (unsigned Op : {G_ASHR, G_LSHR, G_SHL})
@@ -93,12 +139,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   setAction({G_BRCOND, s1}, Legal);
 
   setAction({G_CONSTANT, s32}, Legal);
-  for (auto Ty : {s1, s8, s16})
-    setAction({G_CONSTANT, Ty}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
 
   setAction({G_ICMP, s1}, Legal);
-  for (auto Ty : {s8, s16})
-    setAction({G_ICMP, 1, Ty}, WidenScalar);
+  setLegalizeScalarToDifferentSizeStrategy(G_ICMP, 1,
+      widenToLargerTypesUnsupportedOtherwise);
   for (auto Ty : {s32, p0})
     setAction({G_ICMP, 1, Ty}, Legal);
 
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 4aa7e150342..7424af9d5a5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -41,10 +41,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
@@ -53,8 +55,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index a34ed2cb5a2..5c9aad417ce 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -15,7 +15,7 @@
 #include "ARMMacroFusion.h"
 #include "ARMSubtarget.h"
 #include "llvm/CodeGen/MacroFusion.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 namespace llvm {
 
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 4f330e3a884..f6996f098c0 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
@@ -38,7 +39,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <bitset>
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index d911dd97b1a..a0b98a43108 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -25,6 +25,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -34,7 +35,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
diff --git a/lib/Target/ARM/ThumbRegisterInfo.cpp b/lib/Target/ARM/ThumbRegisterInfo.cpp
index 15a56752333..d2bebb9eeec 100644
--- a/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -29,7 +29,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/AVR/AVRFrameLowering.h b/lib/Target/AVR/AVRFrameLowering.h
index 30ef441183a..a0ba6c95127 100644
--- a/lib/Target/AVR/AVRFrameLowering.h
+++ b/lib/Target/AVR/AVRFrameLowering.h
@@ -10,7 +10,7 @@
 #ifndef LLVM_AVR_FRAME_LOWERING_H
 #define LLVM_AVR_FRAME_LOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h
index eee8a92c619..354edcec346 100644
--- a/lib/Target/AVR/AVRInstrInfo.h
+++ b/lib/Target/AVR/AVRInstrInfo.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_AVR_INSTR_INFO_H
 #define LLVM_AVR_INSTR_INFO_H
 
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #include "AVRRegisterInfo.h"
 
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 7099b29a8bc..b6ac93452cb 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -18,7 +18,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 #include "AVR.h"
 #include "AVRInstrInfo.h"
diff --git a/lib/Target/BPF/BPFFrameLowering.h b/lib/Target/BPF/BPFFrameLowering.h
index 5db963f518b..b4ffa0713fa 100644
--- a/lib/Target/BPF/BPFFrameLowering.h
+++ b/lib/Target/BPF/BPFFrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
 #define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class BPFSubtarget;
diff --git a/lib/Target/BPF/BPFInstrInfo.h b/lib/Target/BPF/BPFInstrInfo.h
index c7048ab979b..f591f48a89a 100644
--- a/lib/Target/BPF/BPFInstrInfo.h
+++ b/lib/Target/BPF/BPFInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
 
 #include "BPFRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "BPFGenInstrInfo.inc"
diff --git a/lib/Target/BPF/BPFRegisterInfo.cpp b/lib/Target/BPF/BPFRegisterInfo.cpp
index 273843e9270..00d609e8960 100644
--- a/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -18,10 +18,10 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_REGINFO_TARGET_DESC
 #include "BPFGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 22794eb50e2..e28af5a844f 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -14,9 +14,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <vector>
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index 501ac2c44bb..6336075917e 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -19,8 +19,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/PassSupport.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 296edbe1eff..988718860c5 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -15,7 +15,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include <vector>
 
 namespace llvm {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index a5381c1fb1a..9b8970258a2 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -36,6 +36,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -47,7 +48,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 2f172340c4e..1558c2e9850 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -19,8 +19,8 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include <cstdint>
 #include <vector>
 
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 29e2bc32dfb..2154a485dc6 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -161,9 +161,16 @@ namespace {
   };
 
   struct Simplifier {
-    using Rule = std::function<Value * (Instruction *, LLVMContext &)>;
+    struct Rule {
+      using FuncType = std::function<Value* (Instruction*, LLVMContext&)>;
+      Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
+      StringRef Name;   // For debugging.
+      FuncType Fn;
+    };
 
-    void addRule(const Rule &R) { Rules.push_back(R); }
+    void addRule(StringRef N, const Rule::FuncType &F) {
+      Rules.push_back(Rule(N, F));
+    }
 
   private:
     struct WorkListType {
@@ -522,7 +529,7 @@ Value *Simplifier::simplify(Context &C) {
       continue;
     bool Changed = false;
     for (Rule &R : Rules) {
-      Value *W = R(U, C.Ctx);
+      Value *W = R.Fn(U, C.Ctx);
       if (!W)
         continue;
       Changed = true;
@@ -1544,8 +1551,30 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
   return R;
 }
 
+static bool hasZeroSignBit(const Value *V) {
+  if (const auto *CI = dyn_cast<const ConstantInt>(V))
+    return (CI->getType()->getSignBit() & CI->getSExtValue()) == 0;
+  const Instruction *I = dyn_cast<const Instruction>(V);
+  if (!I)
+    return false;
+  switch (I->getOpcode()) {
+    case Instruction::LShr:
+      if (const auto SI = dyn_cast<const ConstantInt>(I->getOperand(1)))
+        return SI->getZExtValue() > 0;
+      return false;
+    case Instruction::Or:
+    case Instruction::Xor:
+      return hasZeroSignBit(I->getOperand(0)) &&
+             hasZeroSignBit(I->getOperand(1));
+    case Instruction::And:
+      return hasZeroSignBit(I->getOperand(0)) ||
+             hasZeroSignBit(I->getOperand(1));
+  }
+  return false;
+}
+
 void PolynomialMultiplyRecognize::setupSimplifier() {
-  Simp.addRule(
+  Simp.addRule("sink-zext",
     // Sink zext past bitwise operations.
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
       if (I->getOpcode() != Instruction::ZExt)
@@ -1566,7 +1595,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
                            B.CreateZExt(T->getOperand(0), I->getType()),
                            B.CreateZExt(T->getOperand(1), I->getType()));
     });
-  Simp.addRule(
+  Simp.addRule("xor/and -> and/xor",
     // (xor (and x a) (and y a)) -> (and (xor x y) a)
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
       if (I->getOpcode() != Instruction::Xor)
@@ -1584,7 +1613,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
       return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
                          And0->getOperand(1));
     });
-  Simp.addRule(
+  Simp.addRule("sink binop into select",
     // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
     // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1610,7 +1639,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
       }
       return nullptr;
     });
-  Simp.addRule(
+  Simp.addRule("fold select-select",
     // (select c (select c x y) z) -> (select c x z)
     // (select c x (select c y z)) -> (select c x z)
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
@@ -1629,23 +1658,19 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
       }
       return nullptr;
     });
-  Simp.addRule(
+  Simp.addRule("or-signbit -> xor-signbit",
     // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
       if (I->getOpcode() != Instruction::Or)
         return nullptr;
-      Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
-      if (!LShr || LShr->getOpcode() != Instruction::LShr)
-        return nullptr;
-      ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
-      if (!One || One->getZExtValue() != 1)
-        return nullptr;
       ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
       if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
         return nullptr;
-      return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+      if (!hasZeroSignBit(I->getOperand(0)))
+        return nullptr;
+      return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
     });
-  Simp.addRule(
+  Simp.addRule("sink lshr into binop",
     // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
       if (I->getOpcode() != Instruction::LShr)
@@ -1667,7 +1692,7 @@ void PolynomialMultiplyRecognize::setupSimplifier() {
                 B.CreateLShr(BitOp->getOperand(0), S),
                 B.CreateLShr(BitOp->getOperand(1), S));
     });
-  Simp.addRule(
+  Simp.addRule("expose bitop-const",
     // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
     [](Instruction *I, LLVMContext &Ctx) -> Value* {
       auto IsBitOp = [](unsigned Op) -> bool {
@@ -1737,9 +1762,17 @@ bool PolynomialMultiplyRecognize::recognize() {
   // XXX: Currently this approach can modify the loop before being 100% sure
   // that the transformation can be carried out.
   bool FoundPreScan = false;
+  auto FeedsPHI = [LoopB](const Value *V) -> bool {
+    for (const Value *U : V->users()) {
+      if (const auto *P = dyn_cast<const PHINode>(U))
+        if (P->getParent() == LoopB)
+          return true;
+    }
+    return false;
+  };
   for (Instruction &In : *LoopB) {
     SelectInst *SI = dyn_cast<SelectInst>(&In);
-    if (!SI)
+    if (!SI || !FeedsPHI(SI))
       continue;
 
     Simplifier::Context C(SI);
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 93f1fd4109a..3c88eeeb8a4 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -24,12 +24,12 @@
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 2525d272666..6cca5a849cc 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -20,8 +20,8 @@
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cassert>
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index d432bfef7ae..05865c43f2d 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -1706,28 +1706,27 @@ multiclass Loadxim_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
   defm: Loadxgim_pat<Load, VT, ValueMod, ImmPred, MI>;
 }
 
-// Patterns to select load reg reg-indexed: Rs + Rt<<u2.
-multiclass Loadxr_pat<PatFrag Load, ValueType VT, InstHexagon MI> {
-  let AddedComplexity = 40 in
-  def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
-           (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
-
-  let AddedComplexity = 20 in
-  def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
-           (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
-}
-
-// Patterns to select load reg reg-indexed: Rs + Rt<<u2 with value modifier.
-multiclass Loadxrm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
-                       InstHexagon MI> {
-  let AddedComplexity = 40 in
-  def: Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
-           (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>;
+// Pattern to select load reg reg-indexed: Rs + Rt<<u2.
+class Loadxr_shl_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
+
+// Pattern to select load reg reg-indexed: Rs + Rt<<0.
+class Loadxr_add_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
+
+// Pattern to select load reg reg-indexed: Rs + Rt<<u2 with value modifier.
+class Loadxrm_shl_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+                      InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs, (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+        (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2)))>;
 
-  let AddedComplexity = 20 in
-  def: Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
-           (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>;
-}
+// Pattern to select load reg reg-indexed: Rs + Rt<<0 with value modifier.
+class Loadxrm_add_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+                      InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+        (VT (ValueMod (MI IntRegs:$Rs, IntRegs:$Rt, 0)))>;
 
 // Pattern to select load long-offset reg-indexed: Addr + Rt<<u2.
 // Don't match for u2==0, instead use reg+imm for those cases.
@@ -1777,17 +1776,19 @@ let AddedComplexity = 20 in {
   defm: Loadxi_pat<atomic_load_64,  i64, anyimm3, L2_loadrd_io>;
 }
 
-defm: Loadxim_pat<extloadi1,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<extloadi8,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<extloadi16,   i64, ToZext64, anyimm1, L2_loadruh_io>;
-defm: Loadxim_pat<extloadi32,   i64, ToZext64, anyimm2, L2_loadri_io>;
-defm: Loadxim_pat<zextloadi1,   i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<zextloadi8,   i64, ToZext64, anyimm0, L2_loadrub_io>;
-defm: Loadxim_pat<zextloadi16,  i64, ToZext64, anyimm1, L2_loadruh_io>;
-defm: Loadxim_pat<zextloadi32,  i64, ToZext64, anyimm2, L2_loadri_io>;
-defm: Loadxim_pat<sextloadi8,   i64, ToSext64, anyimm0, L2_loadrb_io>;
-defm: Loadxim_pat<sextloadi16,  i64, ToSext64, anyimm1, L2_loadrh_io>;
-defm: Loadxim_pat<sextloadi32,  i64, ToSext64, anyimm2, L2_loadri_io>;
+let AddedComplexity = 30 in {
+  defm: Loadxim_pat<extloadi1,    i64, ToZext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi8,    i64, ToZext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi16,   i64, ToZext64, anyimm1, L2_loadruh_io>;
+  defm: Loadxim_pat<extloadi32,   i64, ToZext64, anyimm2, L2_loadri_io>;
+  defm: Loadxim_pat<zextloadi1,   i64, ToZext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<zextloadi8,   i64, ToZext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<zextloadi16,  i64, ToZext64, anyimm1, L2_loadruh_io>;
+  defm: Loadxim_pat<zextloadi32,  i64, ToZext64, anyimm2, L2_loadri_io>;
+  defm: Loadxim_pat<sextloadi8,   i64, ToSext64, anyimm0, L2_loadrb_io>;
+  defm: Loadxim_pat<sextloadi16,  i64, ToSext64, anyimm1, L2_loadrh_io>;
+  defm: Loadxim_pat<sextloadi32,  i64, ToSext64, anyimm2, L2_loadri_io>;
+}
 
 let AddedComplexity  = 60 in {
   def: Loadxu_pat<extloadi8,    i32,   anyimm0, L4_loadrub_ur>;
@@ -1818,26 +1819,55 @@ let AddedComplexity  = 60 in {
   def: Loadxum_pat<extloadi32,  i64, anyimm2, ToZext64, L4_loadri_ur>;
 }
 
-defm: Loadxr_pat<extloadi8,     i32, L4_loadrub_rr>;
-defm: Loadxr_pat<zextloadi8,    i32, L4_loadrub_rr>;
-defm: Loadxr_pat<sextloadi8,    i32, L4_loadrb_rr>;
-defm: Loadxr_pat<extloadi16,    i32, L4_loadruh_rr>;
-defm: Loadxr_pat<zextloadi16,   i32, L4_loadruh_rr>;
-defm: Loadxr_pat<sextloadi16,   i32, L4_loadrh_rr>;
-defm: Loadxr_pat<load,          i32, L4_loadri_rr>;
-defm: Loadxr_pat<load,          i64, L4_loadrd_rr>;
-defm: Loadxr_pat<load,          f32, L4_loadri_rr>;
-defm: Loadxr_pat<load,          f64, L4_loadrd_rr>;
-
-defm: Loadxrm_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
-defm: Loadxrm_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
-defm: Loadxrm_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
-defm: Loadxrm_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
-defm: Loadxrm_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
-defm: Loadxrm_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
-defm: Loadxrm_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
-defm: Loadxrm_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
-defm: Loadxrm_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
+let AddedComplexity = 40 in {
+  def: Loadxr_shl_pat<extloadi8,     i32, L4_loadrub_rr>;
+  def: Loadxr_shl_pat<zextloadi8,    i32, L4_loadrub_rr>;
+  def: Loadxr_shl_pat<sextloadi8,    i32, L4_loadrb_rr>;
+  def: Loadxr_shl_pat<extloadi16,    i32, L4_loadruh_rr>;
+  def: Loadxr_shl_pat<zextloadi16,   i32, L4_loadruh_rr>;
+  def: Loadxr_shl_pat<sextloadi16,   i32, L4_loadrh_rr>;
+  def: Loadxr_shl_pat<load,          i32, L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          i64, L4_loadrd_rr>;
+  def: Loadxr_shl_pat<load,          f32, L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 20 in {
+  def: Loadxr_add_pat<extloadi8,     i32, L4_loadrub_rr>;
+  def: Loadxr_add_pat<zextloadi8,    i32, L4_loadrub_rr>;
+  def: Loadxr_add_pat<sextloadi8,    i32, L4_loadrb_rr>;
+  def: Loadxr_add_pat<extloadi16,    i32, L4_loadruh_rr>;
+  def: Loadxr_add_pat<zextloadi16,   i32, L4_loadruh_rr>;
+  def: Loadxr_add_pat<sextloadi16,   i32, L4_loadrh_rr>;
+  def: Loadxr_add_pat<load,          i32, L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          i64, L4_loadrd_rr>;
+  def: Loadxr_add_pat<load,          f32, L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 40 in {
+  def: Loadxrm_shl_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_shl_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_shl_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
+  def: Loadxrm_shl_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_shl_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_shl_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
+  def: Loadxrm_shl_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_shl_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_shl_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
+}
+
+let AddedComplexity = 20 in {
+  def: Loadxrm_add_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_add_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_add_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
+  def: Loadxrm_add_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_add_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_add_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
+  def: Loadxrm_add_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_add_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_add_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
+}
 
 // Absolute address
 
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 7d961a238ae..da53a09a6fc 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -44,12 +44,12 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/PassSupport.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index e491c757670..f29f321214c 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -26,13 +26,13 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 68484344fde..0ff3afff5f5 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -23,7 +23,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index a0fdc70e141..52e5dcd4638 100644
--- a/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -548,14 +548,13 @@ bool HexagonVectorLoopCarriedReuse::doVLCR() {
     findValueToReuse();
     if (ReuseCandidate.isDefined()) {
       reuseValue();
-      Changed = true;
-      Continue = true;
-    }
-    std::for_each(Dependences.begin(), Dependences.end(),
-                  std::default_delete<DepChain>());
-  } while (Continue);
-  return Changed;
-}
+      Changed = true;
+      Continue = true;
+    }
+    llvm::for_each(Dependences, std::default_delete<DepChain>());
+  } while (Continue);
+  return Changed;
+}
 
 void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I,
                                                         DepChain &D) {
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index de58ddff339..22bb8841f5f 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -28,7 +29,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
index 802232b0582..6b4fa777178 100644
--- a/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
+++ b/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Lanai/LanaiFrameLowering.h b/lib/Target/Lanai/LanaiFrameLowering.h
index 2f9b6c3c158..ca690d513fc 100644
--- a/lib/Target/Lanai/LanaiFrameLowering.h
+++ b/lib/Target/Lanai/LanaiFrameLowering.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
 
 #include "Lanai.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/Lanai/LanaiInstrInfo.h b/lib/Target/Lanai/LanaiInstrInfo.h
index 4387fe1af3c..f07fede67a4 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/lib/Target/Lanai/LanaiInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
 
 #include "LanaiRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "LanaiGenInstrInfo.inc"
diff --git a/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index 7259c02194c..c29c933db74 100644
--- a/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -30,8 +30,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 #define GET_INSTRMAP_INFO
diff --git a/lib/Target/Lanai/LanaiRegisterInfo.cpp b/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 6ea477dce3e..56a5e0ea2de 100644
--- a/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -20,11 +20,11 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_REGINFO_TARGET_DESC
 #include "LanaiGenRegisterInfo.inc"
diff --git a/lib/Target/Lanai/LanaiSubtarget.h b/lib/Target/Lanai/LanaiSubtarget.h
index 2732ef3097e..313d950e8aa 100644
--- a/lib/Target/Lanai/LanaiSubtarget.h
+++ b/lib/Target/Lanai/LanaiSubtarget.h
@@ -19,7 +19,7 @@
 #include "LanaiInstrInfo.h"
 #include "LanaiSelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index ce1271d9dea..2fb1a053610 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -19,7 +19,7 @@
 #include "LanaiInstrInfo.h"
 #include "LanaiSelectionDAGInfo.h"
 #include "LanaiSubtarget.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index fdc4aa52a19..8807101f37c 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
 
 #include "MSP430.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class MSP430FrameLowering : public TargetFrameLowering {
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index d81f17e753c..45357f54c9c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
 
 #include "MSP430RegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "MSP430GenInstrInfo.inc"
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 97b5e810a1d..4935b80cfdd 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -16,7 +16,7 @@
 #define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
 
 #include "MSP430Subtarget.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 002fa512b21..d8e2eef6a9f 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -535,7 +535,7 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
                                            uint64_t Address,
                                            const void *Decoder);
 
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
                                        uint64_t Address,
                                        const void *Decoder);
 
@@ -2481,10 +2481,8 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned RegPair,
                                        uint64_t Address, const void *Decoder) {
-  unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
-
   switch (RegPair) {
   default:
     return MCDisassembler::Fail;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 12f7638594d..eae0f975080 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -1115,6 +1115,29 @@ MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
 }
 
 unsigned
+MipsMCCodeEmitter::getMovePRegSingleOpValue(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  assert(((OpNo == 2) || (OpNo == 3)) &&
+         "Unexpected OpNo for movep operand encoding!");
+
+  MCOperand Op = MI.getOperand(OpNo);
+  assert(Op.isReg() && "Operand of movep is not a register!");
+  switch (Op.getReg()) {
+  default:
+    llvm_unreachable("Unknown register for movep!");
+  case Mips::ZERO:  return 0;
+  case Mips::S1:    return 1;
+  case Mips::V0:    return 2;
+  case Mips::V1:    return 3;
+  case Mips::S0:    return 4;
+  case Mips::S2:    return 5;
+  case Mips::S3:    return 6;
+  case Mips::S4:    return 7;
+  }
+}
+
+unsigned
 MipsMCCodeEmitter::getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
                                          SmallVectorImpl<MCFixup> &Fixups,
                                          const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index d12d3195521..1e840114b2b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -252,6 +252,9 @@ public:
   unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
                                   SmallVectorImpl<MCFixup> &Fixups,
                                   const MCSubtargetInfo &STI) const;
+  unsigned getMovePRegSingleOpValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
 
   unsigned getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
                                  SmallVectorImpl<MCFixup> &Fixups,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index aad6bf378ea..0bddba78145 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -246,8 +246,6 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
     break;
   case MEK_CALL_HI16:
   case MEK_CALL_LO16:
-  case MEK_DTPREL_HI:
-  case MEK_DTPREL_LO:
   case MEK_GOT:
   case MEK_GOT_CALL:
   case MEK_GOT_DISP:
@@ -263,14 +261,16 @@ void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case MEK_NEG:
   case MEK_PCREL_HI16:
   case MEK_PCREL_LO16:
-  case MEK_TLSLDM:
     // If we do have nested target-specific expressions, they will be in
     // a consecutive chain.
     if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
       E->fixELFSymbolsInTLSFixups(Asm);
     break;
-  case MEK_GOTTPREL:
+  case MEK_DTPREL_HI:
+  case MEK_DTPREL_LO:
+  case MEK_TLSLDM:
   case MEK_TLSGD:
+  case MEK_GOTTPREL:
   case MEK_TPREL_HI:
   case MEK_TPREL_LO:
     fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
diff --git a/lib/Target/Mips/MicroMips32r6InstrFormats.td b/lib/Target/Mips/MicroMips32r6InstrFormats.td
index 2f0933277e8..e1f1f9262b9 100644
--- a/lib/Target/Mips/MicroMips32r6InstrFormats.td
+++ b/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -829,6 +829,21 @@ class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
   let Inst{3-0}   = 0b0000;
 }
 
+class POOL16C_MOVEP16_FM_MMR6 : MicroMipsR6Inst16 {
+  bits<3> dst_regs;
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b010001;
+  let Inst{9-7}   = dst_regs;
+  let Inst{6-4}   = rt;
+  let Inst{3}     = rs{2};
+  let Inst{2}     = 0b1;
+  let Inst{1-0}   = rs{1-0};
+}
+
 class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
   bits<3> rt;
   bits<3> rs;
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index 425e75e14c8..49d6ae3f98a 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -229,6 +229,7 @@ class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
 class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
 class LI16_MMR6_ENC : LI_FM_MM16;
 class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
+class MOVEP_MMR6_ENC  : POOL16C_MOVEP16_FM_MMR6;
 class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
 class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
 class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
@@ -1204,6 +1205,7 @@ class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
       MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
 class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
       MicroMipsR6Inst16;
+class MOVEP_MMR6_DESC : MovePMM16<"movep", GPRMM16OpndMoveP>, MMR6Arch<"movep">;
 class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
       MicroMipsR6Inst16;
 class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
@@ -1679,6 +1681,8 @@ def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC,
                 ISA_MICROMIPS32R6;
 def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC,
                   ISA_MICROMIPS32R6;
+def MOVEP_MMR6  : StdMMR6Rel, MOVEP_MMR6_DESC, MOVEP_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
 def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC,
                    ISA_MICROMIPS32R6;
 def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
@@ -1879,3 +1883,10 @@ let AddedComplexity = 41 in {
 }
 
 def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+              (TAILCALL_MMR6 tglobaladdr:$dst)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+              (TAILCALL_MMR6 texternalsym:$dst)>, ISA_MICROMIPS32R6;
+
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index e0f4d833392..4f705feed0a 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -162,12 +162,11 @@ class DCLZ_MM64R6_DESC {
 
 class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
                                   uimm5_inssize_plus1, immZExt5Plus32,
-                                  immZExt5Plus1, MipsIns>;
+                                  immZExt5Plus1>;
 class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
-                                  immZExt5, immZExtRange2To64, MipsIns>;
+                                  immZExt5, immZExtRange2To64>;
 class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5_report_uimm6,
-                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1,
-                                 MipsIns>;
+                                 uimm5_inssize_plus1, immZExt5, immZExt5Plus1>;
 class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
                                               II_DMTC0>;
 class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 1f869db4efe..48c1d94d03c 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -631,7 +631,8 @@ def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
 def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
 def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
 def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
-def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
+def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16,
+               ISA_MICROMIPS_NOT_32R6_64R6;
 def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
               IsAsCheapAsAMove;
 def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
@@ -884,7 +885,7 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
                               immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
   def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
-                              immZExt5, immZExt5Plus1, MipsIns>,
+                              immZExt5, immZExt5Plus1>,
                EXT_FM_MM<0x0c>;
 
   /// Jump Instructions
@@ -1061,13 +1062,13 @@ let Predicates = [InMicroMips] in {
                 (LW_MM addr:$addr)>;
   def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
                 (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
-
-  def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
-                (TAILCALL_MM tglobaladdr:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
-  def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
-                (TAILCALL_MM texternalsym:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
 }
 
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+              (TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+              (TAILCALL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
+
 let AddedComplexity = 40 in {
   def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
                 (LH_MM addrRegImm:$a)>;
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 76bca3df2bc..cb59e2ddb1c 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -30,7 +30,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include <cassert>
 #include <cstdint>
 #include <vector>
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index bdb9eec4cc5..8ce47e3f669 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -17,8 +17,8 @@
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 44771cbe8be..ff95f3c7228 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -22,6 +22,8 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
@@ -29,8 +31,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 04a050c2ff4..dbd47de4dad 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -341,13 +341,13 @@ let AdditionalPredicates = [NotInMicroMips] in {
   // for dinsm and dinsu like binutils.
   let DecoderMethod = "DecodeDINS" in {
     def DINS  : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1,
-                        immZExt5, immZExt5Plus1, MipsIns>, EXT_FM<7>,
+                        immZExt5, immZExt5Plus1>, EXT_FM<7>,
                 ISA_MIPS64R2;
     def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1,
-                        immZExt5Plus32, immZExt5Plus1, MipsIns>,
+                        immZExt5Plus32, immZExt5Plus1>,
                 EXT_FM<6>, ISA_MIPS64R2;
     def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64,
-                        immZExt5, immZExtRange2To64, MipsIns>,
+                        immZExt5, immZExtRange2To64>,
                 EXT_FM<5>, ISA_MIPS64R2;
   }
 }
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index bec0ae6ba4c..5edd12c0232 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
@@ -64,7 +65,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include <algorithm>
 #include <cassert>
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index ef05166503b..27a85970da6 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -107,38 +107,31 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
   return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
 }
 
+// Estimate the size of the stack, including the incoming arguments. We need to
+// account for register spills, local objects, reserved call frame and incoming
+// arguments. This is required to determine the largest possible positive offset
+// from $sp so that it can be determined if an emergency spill slot for stack
+// addresses is required.
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
 
-  int64_t Offset = 0;
+  int64_t Size = 0;
 
-  // Iterate over fixed sized objects.
+  // Iterate over fixed sized objects which are incoming arguments.
   for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
-    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+    if (MFI.getObjectOffset(I) > 0)
+      Size += MFI.getObjectSize(I);
 
   // Conservatively assume all callee-saved registers will be saved.
   for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
-    unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
-    Offset = alignTo(Offset + Size, Size);
+    unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
+    Size = alignTo(Size + RegSize, RegSize);
   }
 
-  unsigned MaxAlign = MFI.getMaxAlignment();
-
-  // Check that MaxAlign is not zero if there is a stack object that is not a
-  // callee-saved spill.
-  assert(!MFI.getObjectIndexEnd() || MaxAlign);
-
-  // Iterate over other objects.
-  for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
-    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
-
-  // Call frame.
-  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
-    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
-                     std::max(MaxAlign, getStackAlignment()));
-
-  return alignTo(Offset, getStackAlignment());
+  // Get the size of the rest of the frame objects and any possible reserved
+  // call frame, accounting for alignment.
+  return Size + MFI.estimateStackSize(MF);
 }
 
 // Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 8c4214c4c21..883c3267d51 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
 
 #include "Mips.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
   class MipsSubtarget;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 38b3c3fb160..d31385f42d6 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -27,8 +27,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
@@ -45,6 +45,8 @@
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
@@ -62,8 +64,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index a5ed1be3bee..c18e395f901 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -25,7 +25,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include <cstdint>
 
 #define GET_INSTRINFO_HEADER
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index c4c3eb760c5..3502dbcdae9 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -212,6 +212,8 @@ def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">,
                        AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">;
 def InMips16Mode :    Predicate<"Subtarget->inMips16Mode()">,
                       AssemblerPredicate<"FeatureMips16">;
+def NotInMips16Mode : Predicate<"!Subtarget->inMips16Mode()">,
+                      AssemblerPredicate<"!FeatureMips16">;
 def HasCnMips    :    Predicate<"Subtarget->hasCnMips()">,
                       AssemblerPredicate<"FeatureCnMips">;
 def NotCnMips    :    Predicate<"!Subtarget->hasCnMips()">,
@@ -1544,7 +1546,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
     PseudoInstExpansion<(JumpInst Opnd:$target)>;
 
   class TailCallReg<RegisterOperand RO> :
-    MipsPseudo<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
+    PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
 }
 
 class BAL_BR_Pseudo<Instruction RealInst> :
@@ -1726,12 +1728,13 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
          [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
          FrmR, opstr>, ISA_MIPS32R2;
 
+// 'ins' and its' 64 bit variants are matched by C++ code.
 class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
-              Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
-              SDPatternOperator Op = null_frag>:
+              Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm>:
   InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size, RO:$src))],
+         [(set RO:$rt, (null_frag RO:$rs, PosImm:$pos, SizeImm:$size,
+                                  RO:$src))],
          II_INS, FrmR, opstr>, ISA_MIPS32R2 {
   let Constraints = "$src = $rt";
 }
@@ -2086,7 +2089,7 @@ def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd>,
               BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
 def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
 
-let Predicates = [NotInMicroMips] in { 
+let AdditionalPredicates = [NotInMips16Mode, NotInMicroMips] in {
   def TAILCALL : TailCall<J, jmptarget>;
 }
 
@@ -2103,6 +2106,7 @@ class PseudoIndirectBranchBase<RegisterOperand RO> :
   let isBranch = 1;
   let isIndirectBranch = 1;
   bit isCTI = 1;
+  let Predicates = [NotInMips16Mode];
 }
 
 def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
@@ -2236,7 +2240,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
             EXT_FM<0>;
   def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
                                        uimm5_inssize_plus1, immZExt5,
-                                       immZExt5Plus1, MipsIns>,
+                                       immZExt5Plus1>,
             EXT_FM<4>;
 }
 /// Move Control Registers From/To CPU Registers
@@ -2776,10 +2780,12 @@ def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
 //              (JALR GPR32:$dst)>;
 
 // Tail call
-def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
-              (TAILCALL tglobaladdr:$dst)>;
-def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
-              (TAILCALL texternalsym:$dst)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+                (TAILCALL tglobaladdr:$dst)>;
+  def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+                (TAILCALL texternalsym:$dst)>;
+}
 // hi/lo relocs
 multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
                           Register ZeroReg, RegisterOperand GPROpnd> {
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 01c0cbf8262..3910adb7316 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -28,11 +28,11 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 9c64a0ecbb1..ec966afee0e 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -28,7 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cstdint>
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 08fb3d7d435..f64d91aad85 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -616,6 +616,7 @@ def GPRMM16OpndZero : RegisterOperand<GPRMM16Zero> {
 
 def GPRMM16OpndMoveP : RegisterOperand<GPRMM16MoveP> {
   let ParserMatchClass = GPRMM16AsmOperandMoveP;
+  let EncoderMethod = "getMovePRegSingleOpValue";
 }
 
 def GPR64Opnd : RegisterOperand<GPR64> {
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 0b19b18449e..2d9cbabbc59 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCDwarf.h"
@@ -37,7 +38,6 @@
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
@@ -893,10 +893,12 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
   }
 
   // Set scavenging frame index if necessary.
-  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
-    estimateStackSize(MF);
+  uint64_t MaxSPOffset = estimateStackSize(MF);
 
-  if (isInt<16>(MaxSPOffset))
+  // MSA has a minimum offset of 10 bits signed. If there is a variable
+  // sized object on the stack, the estimation cannot account for it.
+  if (isIntN(STI.hasMSA() ? 10 : 16, MaxSPOffset) &&
+      !MF.getFrameInfo().hasVarSizedObjects())
     return;
 
   const TargetRegisterClass &RC =
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 283fcaa73a7..3c6a7d7a665 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -905,6 +905,64 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
     break;
   }
 
+  // Manually match MipsISD::Ins nodes to get the correct instruction. It has
+  // to be done in this fashion so that we respect the differences between
+  // dins and dinsm, as the difference is that the size operand has the range
+  // 0 < size <= 32 for dins while dinsm has the range 2 <= size <= 64 which
+  // means SelectionDAGISel would have to test all the operands at once to
+  // match the instruction.
+  case MipsISD::Ins: {
+
+    // Sanity checking for the node operands.
+    if (Node->getValueType(0) != MVT::i32 && Node->getValueType(0) != MVT::i64)
+      return false;
+
+    if (Node->getNumOperands() != 4)
+      return false;
+
+    if (Node->getOperand(1)->getOpcode() != ISD::Constant ||
+        Node->getOperand(2)->getOpcode() != ISD::Constant)
+      return false;
+
+    MVT ResTy = Node->getSimpleValueType(0);
+    uint64_t Pos = Node->getConstantOperandVal(1);
+    uint64_t Size = Node->getConstantOperandVal(2);
+
+    // Size has to be >0 for 'ins', 'dins' and 'dinsu'.
+    if (!Size)
+      return false;
+
+    if (Pos + Size > 64)
+      return false;
+
+    if (ResTy != MVT::i32 && ResTy != MVT::i64)
+      return false;
+
+    unsigned Opcode = 0;
+    if (ResTy == MVT::i32) {
+      if (Pos + Size <= 32)
+        Opcode = Mips::INS;
+    } else {
+      if (Pos + Size <= 32)
+        Opcode = Mips::DINS;
+      else if (Pos < 32 && 1 < Size)
+        Opcode = Mips::DINSM;
+      else
+        Opcode = Mips::DINSU;
+    }
+
+    if (Opcode) {
+      SDValue Ops[4] = {
+          Node->getOperand(0), CurDAG->getTargetConstant(Pos, DL, MVT::i32),
+          CurDAG->getTargetConstant(Size, DL, MVT::i32), Node->getOperand(3)};
+
+      ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, ResTy, Ops));
+      return true;
+    }
+
+    return false;
+  }
+
   case MipsISD::ThreadPointer: {
     EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
     unsigned RdhwrOpc, DestReg;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 45d7f94f1d1..4dd9f7f219a 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -31,6 +31,7 @@
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Intrinsics.h"
@@ -40,7 +41,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cassert>
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 86bd24166bb..2ff6b99e78f 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -23,6 +23,8 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
@@ -30,8 +32,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 89cda676441..9621009ed1c 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -736,6 +736,7 @@ def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MMR6$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>;
 def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP_MM$")>;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 6ced2f6967c..729f3ed7b79 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -20,8 +20,8 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MachineLocation.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 320ca9a2f09..a802cf85d2e 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class NVPTXSubtarget;
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7b9acb20b75..ac4f2544fc3 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3449,6 +3449,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   }
 
   case Intrinsic::nvvm_atomic_load_add_f32:
+  case Intrinsic::nvvm_atomic_load_add_f64:
   case Intrinsic::nvvm_atomic_load_inc_32:
   case Intrinsic::nvvm_atomic_load_dec_32:
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index d284282e28c..18ba7684ae5 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -16,7 +16,7 @@
 
 #include "NVPTX.h"
 #include "NVPTXRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "NVPTXGenInstrInfo.inc"
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index f745b6f6635..478f3e9d057 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1095,6 +1095,12 @@ def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
 def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
   (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
+def atomic_load_add_f64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f64 node:$a, node:$b)>;
 
 defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
   atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
@@ -1121,6 +1127,13 @@ defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
 defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
   atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
 
+defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2<Float64Regs, ".global", ".f64", ".add",
+  atomic_load_add_f64_g, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2<Float64Regs, ".shared", ".f64", ".add",
+  atomic_load_add_f64_s, f64imm, fpimm, hasAtomAddF64>;
+defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2<Float64Regs, "", ".f64", ".add",
+  atomic_load_add_f64_gen, f64imm, fpimm, hasAtomAddF64>;
+
 // atom_sub
 
 def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 4e902c0fb50..38437b20433 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -36,7 +36,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 88288abe64f..3957d426653 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -20,7 +20,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 8d46694fbe5..75573832988 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -18,8 +18,8 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MachineLocation.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 7674135f0a7..54a72a688ee 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -17,7 +17,7 @@
 #include "ManagedStringPool.h"
 #include "NVPTXSubtarget.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/Nios2/Nios2FrameLowering.h b/lib/Target/Nios2/Nios2FrameLowering.h
index 2aaea678d9e..2d9e84b2c72 100644
--- a/lib/Target/Nios2/Nios2FrameLowering.h
+++ b/lib/Target/Nios2/Nios2FrameLowering.h
@@ -14,7 +14,7 @@
 #define LLVM_LIB_TARGET_NIOS2_NIOS2FRAMELOWERING_H
 
 #include "Nios2.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class Nios2Subtarget;
diff --git a/lib/Target/Nios2/Nios2InstrInfo.h b/lib/Target/Nios2/Nios2InstrInfo.h
index 47e5e83a39d..6a0a050c839 100644
--- a/lib/Target/Nios2/Nios2InstrInfo.h
+++ b/lib/Target/Nios2/Nios2InstrInfo.h
@@ -18,7 +18,7 @@
 #include "Nios2RegisterInfo.h"
 
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "Nios2GenInstrInfo.inc"
diff --git a/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 33085a42361..ac28d7ff497 100644
--- a/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -21,9 +21,9 @@
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index fa813db5fef..f845d5a9ac6 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -15,7 +15,7 @@
 
 #include "PPC.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 8ea3689b08e..2092748ca1a 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -36,6 +36,7 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DebugLoc.h"
@@ -53,7 +54,6 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
 #include <cassert>
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f3e7b4af45d..62ade966145 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -51,6 +51,7 @@
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
@@ -82,7 +83,6 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -291,14 +291,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FROUND, MVT::f32, Legal);
   }
 
-  // PowerPC does not have BSWAP
+  // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
+  // to speed up scalar BSWAP64.
   // CTPOP or CTTZ were introduced in P8/P9 respectivelly
   setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
-  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
   if (Subtarget.isISA3_0()) {
+    setOperationAction(ISD::BSWAP, MVT::i64  , Custom);
     setOperationAction(ISD::CTTZ , MVT::i32  , Legal);
     setOperationAction(ISD::CTTZ , MVT::i64  , Legal);
   } else {
+    setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
     setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
     setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
   }
@@ -781,6 +783,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::SRL, MVT::v1i128, Legal);
       setOperationAction(ISD::SRA, MVT::v1i128, Expand);
     }
+
+    if (Subtarget.hasP9Altivec()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+    }
   }
 
   if (Subtarget.hasQPX()) {
@@ -7888,6 +7895,107 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+                                           SelectionDAG &DAG) const {
+  const unsigned BytesInVector = 16;
+  bool IsLE = Subtarget.isLittleEndian();
+  SDLoc dl(N);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned ShiftElts = 0, InsertAtByte = 0;
+  bool Swap = false;
+
+  // Shifts required to get the byte we want at element 7.
+  unsigned LittleEndianShifts[] = {8, 7,  6,  5,  4,  3,  2,  1,
+                                   0, 15, 14, 13, 12, 11, 10, 9};
+  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+                                1, 2,  3,  4,  5,  6,  7,  8};
+
+  ArrayRef<int> Mask = N->getMask();
+  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.
+  // Possible permutations inserting an element from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   ...
+  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+  // Inserting from V1 into V2 will be similar, except mask range will be
+  // [16,31].
+
+  bool FoundCandidate = false;
+  // If both vector operands for the shuffle are the same vector, the mask
+  // will contain only elements from the first one and the second one will be
+  // undef.
+  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+  // Go through the mask of half-words to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < BytesInVector; ++i) {
+    unsigned CurrentElement = Mask[i];
+    // If 2nd operand is undefined, we should only look for element 7 in the
+    // Mask.
+    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+      continue;
+
+    bool OtherElementsInOrder = true;
+    // Examine the other elements in the Mask to see if they're in original
+    // order.
+    for (unsigned j = 0; j < BytesInVector; ++j) {
+      if (j == i)
+        continue;
+      // If CurrentElement is from V1 [0,15], then we the rest of the Mask to be
+      // from V2 [16,31] and vice versa.  Unless the 2nd operand is undefined,
+      // in which we always assume we're always picking from the 1st operand.
+      int MaskOffset =
+          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+        OtherElementsInOrder = false;
+        break;
+      }
+    }
+    // If other elements are in original order, we record the number of shifts
+    // we need to get the element we want into element 7. Also record which byte
+    // in the vector we should insert into.
+    if (OtherElementsInOrder) {
+      // If 2nd operand is undefined, we assume no shifts and no swapping.
+      if (V2.isUndef()) {
+        ShiftElts = 0;
+        Swap = false;
+      } else {
+        // Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.
+        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
+                         : BigEndianShifts[CurrentElement & 0xF];
+        Swap = CurrentElement < BytesInVector;
+      }
+      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
+      FoundCandidate = true;
+      break;
+    }
+  }
+
+  if (!FoundCandidate)
+    return SDValue();
+
+  // Candidate found, construct the proper SDAG sequence with VINSERTB,
+  // optionally with VECSHL if shift is required.
+  if (Swap)
+    std::swap(V1, V2);
+  if (V2.isUndef())
+    V2 = V1;
+  if (ShiftElts) {
+    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+                              DAG.getConstant(ShiftElts, dl, MVT::i32));
+    return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
+                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
+  }
+  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
+                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
+}
+
 /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
 /// by the VINSERTH instruction introduced in ISA 3.0, else just return default
 /// SDValue.
@@ -8035,8 +8143,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   }
 
   if (Subtarget.hasP9Altivec()) {
-    SDValue NewISDNode = lowerToVINSERTH(SVOp, DAG);
-    if (NewISDNode)
+    SDValue NewISDNode;
+    if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
+      return NewISDNode;
+
+    if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
       return NewISDNode;
   }
 
@@ -8675,6 +8786,23 @@ SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
   return Op;
 }
 
+// Lower scalar BSWAP64 to xxbrd.
+SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // MTVSRDD
+  Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
+                   Op.getOperand(0));
+  // XXBRD
+  Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
+  // MFVSRD
+  int VectorIndex = 0;
+  if (Subtarget.isLittleEndian())
+    VectorIndex = 1;
+  Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
+                   DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
+  return Op;
+}
+
 SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -8719,11 +8847,29 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
          "Should only be called for ISD::INSERT_VECTOR_ELT");
+
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   // We have legal lowering for constant indices but not for variable ones.
-  if (C)
-    return Op;
-  return SDValue();
+  if (!C)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
+  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
+    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
+    unsigned InsertAtElement = C->getZExtValue();
+    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
+    if (Subtarget.isLittleEndian()) {
+      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
+    }
+    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
+                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
+  }
+  return Op;
 }
 
 SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -9146,6 +9292,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SREM:
   case ISD::UREM:
     return LowerREM(Op, DAG);
+  case ISD::BSWAP:
+    return LowerBSWAP(Op, DAG);
   }
 }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 1a5efeba4cf..bf9c4b8e63b 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -953,6 +953,7 @@ namespace llvm {
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
@@ -1079,6 +1080,11 @@ namespace llvm {
     /// from one vector into the other.
     SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
 
+    /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the VINSERTB instruction introduced in ISA 3.0. This is
+    /// essentially v16i8 vector version of VINSERTH.
+    SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
   }; // end class PPCTargetLowering
 
   namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 506fac7dfc1..e751c149b0b 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1312,7 +1312,12 @@ def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
 def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
 
 // Vector Insert Element Instructions
-def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
+                        (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+                        "vinsertb $vD, $vB, $UIM", IIC_VecGeneral,
+                        [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB,
+                                                      imm32SExt16:$UIM))]>,
+                        RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
 def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
                         (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
                         "vinserth $vD, $vB, $UIM", IIC_VecGeneral,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index ab86a54f6fe..565392f76e4 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -16,7 +16,7 @@
 
 #include "PPC.h"
 #include "PPCRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "PPCGenInstrInfo.inc"
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 1fc50d2c860..3261bc9bc53 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2595,6 +2595,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
            (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
   }
 
+  // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
+  // of f64
+  def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
+            (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+  def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
+            (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+
   // Patterns for which instructions from ISA 3.0 are a better match
   let Predicates = [IsLittleEndian, HasP9Vector] in {
   def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index d46c1383297..78467e81795 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -28,6 +28,8 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -37,8 +39,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <cstdlib>
diff --git a/lib/Target/RISCV/RISCV.h b/lib/Target/RISCV/RISCV.h
index 1b6140203c8..884cb2e5014 100644
--- a/lib/Target/RISCV/RISCV.h
+++ b/lib/Target/RISCV/RISCV.h
@@ -15,15 +15,21 @@
 #ifndef LLVM_LIB_TARGET_RISCV_RISCV_H
 #define LLVM_LIB_TARGET_RISCV_RISCV_H
 
-#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
 
 namespace llvm {
 class RISCVTargetMachine;
+class AsmPrinter;
+class FunctionPass;
 class MCInst;
+class MCOperand;
 class MachineInstr;
+class MachineOperand;
 
-void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI);
+void LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    const AsmPrinter &AP);
+bool LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+                                         MCOperand &MCOp, const AsmPrinter &AP);
 
 FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
 }
diff --git a/lib/Target/RISCV/RISCV.td b/lib/Target/RISCV/RISCV.td
index 54aa570e13b..da919acad36 100644
--- a/lib/Target/RISCV/RISCV.td
+++ b/lib/Target/RISCV/RISCV.td
@@ -40,9 +40,7 @@ def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
 //===----------------------------------------------------------------------===//
 
 def RISCVInstrInfo : InstrInfo {
-  // TODO: disable guessInstructionProperties when
-  // https://reviews.llvm.org/D37065 lands.
-  let guessInstructionProperties = 1;
+  let guessInstructionProperties = 0;
 }
 
 def RISCVAsmParser : AsmParser {
diff --git a/lib/Target/RISCV/RISCVAsmPrinter.cpp b/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 1c213b6c7e9..4808e6c73c5 100644
--- a/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -43,6 +43,11 @@ public:
 
   bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
                                    const MachineInstr *MI);
+
+  // Wrapper needed for tblgenned pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+    return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this);
+  }
 };
 }
 
@@ -56,7 +61,7 @@ void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
 
   MCInst TmpInst;
-  LowerRISCVMachineInstrToMCInst(MI, TmpInst);
+  LowerRISCVMachineInstrToMCInst(MI, TmpInst, *this);
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
diff --git a/lib/Target/RISCV/RISCVCallingConv.td b/lib/Target/RISCV/RISCVCallingConv.td
index e0c25e32e01..0b7a523424c 100644
--- a/lib/Target/RISCV/RISCVCallingConv.td
+++ b/lib/Target/RISCV/RISCVCallingConv.td
@@ -27,3 +27,6 @@ def CC_RISCV32 : CallingConv<[
 ]>;
 
 def CSR : CalleeSavedRegs<(add X1, X3, X4, X8, X9, (sequence "X%u", 18, 27))>;
+
+// Needed for implementation of RISCVRegisterInfo::getNoPreservedMask()
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/RISCV/RISCVFrameLowering.h b/lib/Target/RISCV/RISCVFrameLowering.h
index 14772ddac4a..0b2c7a40298 100644
--- a/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/lib/Target/RISCV/RISCVFrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H
 #define LLVM_LIB_TARGET_RISCV_RISCVFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class RISCVSubtarget;
@@ -30,6 +30,12 @@ public:
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   bool hasFP(const MachineFunction &MF) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override {
+    return MBB.erase(MI);
+  }
 };
 }
 #endif
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index d76170b7b78..98f7aa16e2e 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -49,8 +49,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
   setStackPointerRegisterToSaveRestore(RISCV::X2);
 
+  for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
+    setLoadExtAction(N, XLenVT, MVT::i1, Promote);
+
   // TODO: add all necessary setOperationAction calls.
+  setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
 
+  setOperationAction(ISD::BR_CC, XLenVT, Expand);
   setBooleanContents(ZeroOrOneBooleanContent);
 
   // Function alignments (log2).
@@ -63,6 +68,30 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   switch (Op.getOpcode()) {
   default:
     report_fatal_error("unimplemented operand");
+  case ISD::GlobalAddress:
+    return lowerGlobalAddress(Op, DAG);
+  }
+}
+
+SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT Ty = Op.getValueType();
+  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = N->getGlobal();
+  int64_t Offset = N->getOffset();
+
+  if (!isPositionIndependent() && !Subtarget.is64Bit()) {
+    SDValue GAHi =
+        DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_HI);
+    SDValue GALo =
+        DAG.getTargetGlobalAddress(GV, DL, Ty, Offset, RISCVII::MO_LO);
+    SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, GAHi), 0);
+    SDValue MNLo =
+        SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, GALo), 0);
+    return MNLo;
+  } else {
+    report_fatal_error("Unable to lowerGlobalAddress");
   }
 }
 
@@ -79,6 +108,7 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
   default:
     report_fatal_error("Unsupported calling convention");
   case CallingConv::C:
+  case CallingConv::Fast:
     break;
   }
 
@@ -115,6 +145,135 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
+// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
+// and output parameter nodes.
+SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  CLI.IsTailCall = false;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (IsVarArg) {
+    report_fatal_error("LowerCall with varargs not implemented");
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Analyze the operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV32);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+  for (auto &Arg : Outs) {
+    if (!Arg.Flags.isByVal())
+      continue;
+    report_fatal_error("Passing arguments byval not yet implemented");
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+
+  // Copy argument values to their designated locations.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SDValue StackPtr;
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    SDValue ArgValue = OutVals[I];
+
+    // Promote the value if needed.
+    // For now, only handle fully promoted arguments.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    if (VA.isRegLoc()) {
+      // Queue up the argument copies and emit them at the end.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+    } else {
+      assert(VA.isMemLoc() && "Argument not register or memory");
+      report_fatal_error("Passing arguments via the stack not yet implemented");
+    }
+  }
+
+  SDValue Glue;
+
+  // Build a sequence of copy-to-reg nodes, chained and glued together.
+  for (auto &Reg : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
+    Glue = Chain.getValue(1);
+  }
+
+  if (isa<GlobalAddressSDNode>(Callee)) {
+    Callee = lowerGlobalAddress(Callee, DAG);
+  } else if (isa<ExternalSymbolSDNode>(Callee)) {
+    report_fatal_error(
+        "lowerExternalSymbol, needed for lowerCall, not yet handled");
+  }
+
+  // The first call operand is the chain and the second is the target address.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (auto &Reg : RegsToPass)
+    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies, if any.
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Mark the end of the call, which is glued to the call itself.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, DL, PtrVT, true),
+                             DAG.getConstant(0, DL, PtrVT, true),
+                             Glue, DL);
+  Glue = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+  RetCCInfo.AnalyzeCallResult(Ins, RetCC_RISCV32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (auto &VA : RVLocs) {
+    // Copy the value out, gluing the copy to the end of the call sequence.
+    SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+                                          VA.getLocVT(), Glue);
+    Chain = RetValue.getValue(1);
+    Glue = RetValue.getValue(2);
+
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
 SDValue
 RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  bool IsVarArg,
@@ -165,6 +324,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
     break;
   case RISCVISD::RET_FLAG:
     return "RISCVISD::RET_FLAG";
+  case RISCVISD::CALL:
+    return "RISCVISD::CALL";
   }
   return nullptr;
 }
diff --git a/lib/Target/RISCV/RISCVISelLowering.h b/lib/Target/RISCV/RISCVISelLowering.h
index 9fed48fc04e..dfb4824cc18 100644
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@@ -24,7 +24,8 @@ class RISCVSubtarget;
 namespace RISCVISD {
 enum NodeType : unsigned {
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
-  RET_FLAG
+  RET_FLAG,
+  CALL
 };
 }
 
@@ -52,10 +53,13 @@ private:
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override {
     return true;
   }
+  SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
 };
 }
 
diff --git a/lib/Target/RISCV/RISCVInstrInfo.cpp b/lib/Target/RISCV/RISCVInstrInfo.cpp
index 92db5358ce4..5b4f4fcbb88 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -28,4 +28,50 @@
 
 using namespace llvm;
 
-RISCVInstrInfo::RISCVInstrInfo() : RISCVGenInstrInfo() {}
+RISCVInstrInfo::RISCVInstrInfo()
+    : RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP) {}
+
+void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const DebugLoc &DL, unsigned DstReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+  assert(RISCV::GPRRegClass.contains(DstReg, SrcReg) &&
+         "Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(RISCV::ADDI), DstReg)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addImm(0);
+}
+
+void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool IsKill, int FI,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &RISCV::GPRRegClass)
+    BuildMI(MBB, I, DL, get(RISCV::SW))
+        .addReg(SrcReg, getKillRegState(IsKill))
+        .addFrameIndex(FI)
+        .addImm(0);
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DstReg, int FI,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &RISCV::GPRRegClass)
+    BuildMI(MBB, I, DL, get(RISCV::LW), DstReg).addFrameIndex(FI).addImm(0);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
diff --git a/lib/Target/RISCV/RISCVInstrInfo.h b/lib/Target/RISCV/RISCVInstrInfo.h
index 50404d5554d..05c8378445c 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/lib/Target/RISCV/RISCVInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_RISCV_RISCVINSTRINFO_H
 
 #include "RISCVRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "RISCVGenInstrInfo.inc"
@@ -26,7 +26,21 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
 
 public:
   RISCVInstrInfo();
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                   const DebugLoc &DL, unsigned DstReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool IsKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DstReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
 };
 }
-
 #endif
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 23adf1eda9d..23f218fda8f 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -17,8 +17,22 @@ include "RISCVInstrFormats.td"
 // RISC-V specific DAG Nodes.
 //===----------------------------------------------------------------------===//
 
-def RetFlag : SDNode<"RISCVISD::RET_FLAG", SDTNone,
-                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def SDT_RISCVCall         : SDTypeProfile<0, -1, [SDTCisVT<0, XLenVT>]>;
+def SDT_RISCVCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+                                            SDTCisVT<1, i32>]>;
+def SDT_RISCVCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                          SDTCisVT<1, i32>]>;
+
+
+def Call         : SDNode<"RISCVISD::CALL", SDT_RISCVCall,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPVariadic]>;
+def CallSeqStart : SDNode<"ISD::CALLSEQ_START", SDT_RISCVCallSeqStart,
+                          [SDNPHasChain, SDNPOutGlue]>;
+def CallSeqEnd   : SDNode<"ISD::CALLSEQ_END", SDT_RISCVCallSeqEnd,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def RetFlag      : SDNode<"RISCVISD::RET_FLAG", SDTNone,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
 //===----------------------------------------------------------------------===//
 // Operand and SDNode transformation definitions.
@@ -67,7 +81,7 @@ def uimm12 : Operand<XLenVT> {
 }
 
 // A 13-bit signed immediate where the least significant bit is zero.
-def simm13_lsb0 : Operand<XLenVT> {
+def simm13_lsb0 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
   let EncoderMethod = "getImmOpValueAsr1";
   let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
@@ -80,12 +94,30 @@ def uimm20 : Operand<XLenVT> {
 }
 
 // A 21-bit signed immediate where the least significant bit is zero.
-def simm21_lsb0 : Operand<XLenVT> {
+def simm21_lsb0 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<21, "Lsb0">;
   let EncoderMethod = "getImmOpValueAsr1";
   let DecoderMethod = "decodeSImmOperandAndLsl1<21>";
 }
 
+// Standalone (codegen-only) immleaf patterns.
+def simm32 : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
+
+// Extract least significant 12 bits from an immediate value and sign extend
+// them.
+def LO12Sext : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(SignExtend64<12>(N->getZExtValue()),
+                                   SDLoc(N), N->getValueType(0));
+}]>;
+
+// Extract the most significant 20 bits from an immediate value. Add 1 if bit
+// 11 is 1, to compensate for the low 12 bits in the matching immediate addi
+// or ld/st being negative.
+def HI20 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(((N->getZExtValue()+0x800) >> 12) & 0xfffff,
+                                   SDLoc(N), N->getValueType(0));
+}]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction Class Templates
 //===----------------------------------------------------------------------===//
@@ -257,6 +289,12 @@ class PatGprUimm5<SDPatternOperator OpNode, RVInstIShift Inst>
     : Pat<(OpNode GPR:$rs1, uimm5:$shamt),
           (Inst GPR:$rs1, uimm5:$shamt)>;
 
+/// Immediates
+
+def : Pat<(simm12:$imm), (ADDI X0, simm12:$imm)>;
+// TODO: Add a pattern for immediates with all zeroes in the lower 12 bits.
+def : Pat<(simm32:$imm), (ADDI (LUI (HI20 imm:$imm)), (LO12Sext imm:$imm))>;
+
 /// Simple arithmetic operations
 
 def : PatGprGpr<add, ADD>;
@@ -284,6 +322,80 @@ def : PatGprSimm12<setult, SLTIU>;
 
 /// Branches and jumps
 
+// Match `(brcond (CondOp ..), ..)` and lower to the appropriate RISC-V branch
+// instruction.
+class BccPat<PatFrag CondOp, RVInstB Inst>
+    : Pat<(brcond (i32 (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
+          (Inst GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12)>;
+
+def : BccPat<seteq, BEQ>;
+def : BccPat<setne, BNE>;
+def : BccPat<setlt, BLT>;
+def : BccPat<setge, BGE>;
+def : BccPat<setult, BLTU>;
+def : BccPat<setuge, BGEU>;
+
+class BccSwapPat<PatFrag CondOp, RVInst InstBcc>
+    : Pat<(brcond (i32 (CondOp GPR:$rs1, GPR:$rs2)), bb:$imm12),
+          (InstBcc GPR:$rs2, GPR:$rs1, bb:$imm12)>;
+
+// Condition codes that don't have matching RISC-V branch instructions, but
+// are trivially supported by swapping the two input operands
+def : BccSwapPat<setgt, BLT>;
+def : BccSwapPat<setle, BGE>;
+def : BccSwapPat<setugt, BLTU>;
+def : BccSwapPat<setule, BGEU>;
+
+// An extra pattern is needed for a brcond without a setcc (i.e. where the
+// condition was calculated elsewhere).
+def : Pat<(brcond GPR:$cond, bb:$imm12), (BNE GPR:$cond, X0, bb:$imm12)>;
+
+let isBarrier = 1, isBranch = 1, isTerminator = 1 in
+def PseudoBR : Pseudo<(outs), (ins simm21_lsb0:$imm20), [(br bb:$imm20)]>,
+               PseudoInstExpansion<(JAL X0, simm21_lsb0:$imm20)>;
+
+let isCall = 1, Defs=[X1] in
+def PseudoCALL : Pseudo<(outs), (ins GPR:$rs1), [(Call GPR:$rs1)]>,
+                 PseudoInstExpansion<(JALR X1, GPR:$rs1, 0)>;
+
 let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(RetFlag)]>,
                 PseudoInstExpansion<(JALR X0, X1, 0)>;
+
+/// Loads
+
+multiclass LdPat<PatFrag LoadOp, RVInst Inst> {
+  def : Pat<(LoadOp GPR:$rs1), (Inst GPR:$rs1, 0)>;
+  def : Pat<(LoadOp (add GPR:$rs1, simm12:$imm12)),
+            (Inst GPR:$rs1, simm12:$imm12)>;
+}
+
+defm : LdPat<sextloadi8, LB>;
+defm : LdPat<extloadi8, LB>;
+defm : LdPat<sextloadi16, LH>;
+defm : LdPat<extloadi16, LH>;
+defm : LdPat<load, LW>;
+defm : LdPat<zextloadi8, LBU>;
+defm : LdPat<zextloadi16, LHU>;
+
+/// Stores
+
+multiclass StPat<PatFrag StoreOp, RVInst Inst> {
+  def : Pat<(StoreOp GPR:$rs2, GPR:$rs1), (Inst GPR:$rs2, GPR:$rs1, 0)>;
+  def : Pat<(StoreOp GPR:$rs2, (add GPR:$rs1, simm12:$imm12)),
+            (Inst GPR:$rs2, GPR:$rs1, simm12:$imm12)>;
+}
+
+defm : StPat<truncstorei8, SB>;
+defm : StPat<truncstorei16, SH>;
+defm : StPat<store, SW>;
+
+/// Other pseudo-instructions
+
+// Pessimistically assume the stack pointer will be clobbered
+let Defs = [X2], Uses = [X2] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                              [(CallSeqStart timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                              [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+} // Defs = [X2], Uses = [X2]
diff --git a/lib/Target/RISCV/RISCVMCInstLower.cpp b/lib/Target/RISCV/RISCVMCInstLower.cpp
index 1ac8d982ff9..ef0051ed56e 100644
--- a/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -13,6 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCV.h"
+#include "MCTargetDesc/RISCVMCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -24,27 +26,72 @@
 
 using namespace llvm;
 
-void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI,
-                                          MCInst &OutMI) {
+static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
+                                    const AsmPrinter &AP) {
+  MCContext &Ctx = AP.OutContext;
+  RISCVMCExpr::VariantKind Kind;
+
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case RISCVII::MO_None:
+    Kind = RISCVMCExpr::VK_RISCV_None;
+    break;
+  case RISCVII::MO_LO:
+    Kind = RISCVMCExpr::VK_RISCV_LO;
+    break;
+  case RISCVII::MO_HI:
+    Kind = RISCVMCExpr::VK_RISCV_HI;
+    break;
+  }
+
+  const MCExpr *ME =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    ME = MCBinaryExpr::createAdd(
+        ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+
+  ME = RISCVMCExpr::create(ME, Kind, Ctx);
+  return MCOperand::createExpr(ME);
+}
+
+bool llvm::LowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
+                                               MCOperand &MCOp,
+                                               const AsmPrinter &AP) {
+  switch (MO.getType()) {
+  default:
+    report_fatal_error("LowerRISCVMachineInstrToMCInst: unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return false;
+    MCOp = MCOperand::createReg(MO.getReg());
+    break;
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), AP.OutContext));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = lowerSymbolOperand(MO, AP.getSymbol(MO.getGlobal()), AP);
+    break;
+  }
+  return true;
+}
+
+void llvm::LowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                          const AsmPrinter &AP) {
   OutMI.setOpcode(MI->getOpcode());
 
   for (const MachineOperand &MO : MI->operands()) {
     MCOperand MCOp;
-    switch (MO.getType()) {
-    default:
-      report_fatal_error(
-          "LowerRISCVMachineInstrToMCInst: unknown operand type");
-    case MachineOperand::MO_Register:
-      // Ignore all implicit register operands.
-      if (MO.isImplicit())
-        continue;
-      MCOp = MCOperand::createReg(MO.getReg());
-      break;
-    case MachineOperand::MO_Immediate:
-      MCOp = MCOperand::createImm(MO.getImm());
-      break;
-    }
-
-    OutMI.addOperand(MCOp);
+    if (LowerRISCVMachineOperandToMCOperand(MO, MCOp, AP))
+      OutMI.addOperand(MCOp);
   }
 }
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.cpp b/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 4f6c528061c..cd658d7e2d9 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -18,9 +18,9 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_REGINFO_TARGET_DESC
 #include "RISCVGenRegisterInfo.inc"
@@ -50,12 +50,47 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
+const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
 void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                             int SPAdj, unsigned FIOperandNum,
                                             RegScavenger *RS) const {
-  report_fatal_error("Subroutines not supported yet");
+  // TODO: this implementation is a temporary placeholder which does just
+  // enough to allow other aspects of code generation to be tested
+
+  assert(SPAdj == 0 && "Unexpected non-zero SPAdj value");
+
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned FrameReg = getFrameRegister(MF);
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  assert(TFI->hasFP(MF) && "eliminateFrameIndex currently requires hasFP");
+
+  // Offsets must be directly encoded in a 12-bit immediate field
+  if (!isInt<12>(Offset)) {
+    report_fatal_error(
+        "Frame offsets outside of the signed 12-bit range not supported");
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+  return;
 }
 
 unsigned RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return RISCV::X8;
 }
+
+const uint32_t *
+RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+                                        CallingConv::ID /*CC*/) const {
+  return CSR_RegMask;
+}
diff --git a/lib/Target/RISCV/RISCVRegisterInfo.h b/lib/Target/RISCV/RISCVRegisterInfo.h
index 94af9f44ecd..d9de9bf8c76 100644
--- a/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -25,10 +25,15 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
 
   RISCVRegisterInfo(unsigned HwMode);
 
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
+  const uint32_t *getNoPreservedMask() const override;
+
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index df819ccd15d..6948b72747c 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -19,8 +19,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index ac0e69ccde1..6098afa6898 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
 
 #include "Sparc.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index c053cc4c475..524b5d05416 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
 
 #include "SparcRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "SparcGenInstrInfo.inc"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 37a1fdf4d77..b9647eaa3d5 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -20,10 +20,10 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index bfbdb8d0b44..ad6b55a9fc9 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -19,7 +19,7 @@
 #include "SparcInstrInfo.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <string>
 
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 91c5a5d53a1..a75d111b029 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -11,7 +11,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
 
 #include "llvm/ADT/IndexedMap.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class SystemZTargetMachine;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 4533f4fdf21..19ce7776fed 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -27,12 +27,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index b8be1f5f392..216139eb7c7 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -20,7 +20,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include <cstdint>
 
 #define GET_INSTRINFO_HEADER
diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp
index d4cd89ce590..27c72b41e4f 100644
--- a/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -19,7 +19,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 05f93ce5162..a44fae523fe 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -13,7 +13,7 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index a4d9421e08a..6d50369e587 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -323,6 +323,11 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
   return 0;
 }
 
+bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+  EVT VT = TLI->getValueType(DL, DataType);
+  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,  
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 28821a2ca11..4b11a6f0a83 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -62,6 +62,7 @@ public:
   unsigned getPrefetchDistance() { return 2000; }
   unsigned getMinPrefetchStride() { return 2048; }
 
+  bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool prefersVectorizedAddressing() { return false; }
   bool LSRWithInstrQueries() { return true; }
   bool supportsEfficientVectorElementLoadStore() { return true; }
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 5dcb89477a3..b24888fa9cb 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -167,6 +167,13 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
     if (GV && !GV->isDeclarationForLinker())
       return true;
 
+    // A symbol marked nonlazybind should not be accessed with a plt. If the
+    // symbol turns out to be external, the linker will convert a direct
+    // access to an access via the plt, so don't assume it is local.
+    const Function *F = dyn_cast_or_null<Function>(GV);
+    if (F && F->hasFnAttribute(Attribute::NonLazyBind))
+      return false;
+
     bool IsTLS = GV && GV->isThreadLocal();
     bool IsAccessViaCopyRelocs =
         Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 211358ad66c..ee60c8f3a7a 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -267,12 +267,11 @@ bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   if (AsmVariant != 0)
     report_fatal_error("There are no defined alternate asm variants");
 
-  if (!ExtraCode) {
-    // TODO: For now, we just hard-code 0 as the constant offset; teach
-    // SelectInlineAsmMemoryOperand how to do address mode matching.
-    OS << "0(" + regToString(MI->getOperand(OpNo)) + ')';
-    return false;
-  }
+  // The current approach to inline asm is that "r" constraints are expressed
+  // as local indices, rather than values on the operand stack. This simplifies
+  // using "r" as it eliminates the need to push and pop the values in a
+  // particular order, however it also makes it impossible to have an "m"
+  // constraint. So we don't support it.
 
   return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
 }
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 41249117ae0..e2edb924d4d 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -294,6 +294,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
 
         unsigned OldReg = MO.getReg();
 
+        // Inline asm may have a def in the middle of the operands. Our contract
+        // with inline asm register operands is to provide local indices as
+        // immediates.
+        if (MO.isDef()) {
+          assert(MI.getOpcode() == TargetOpcode::INLINEASM);
+          unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+          MRI.removeRegOperandFromUseList(&MO);
+          MO = MachineOperand::CreateImm(LocalId);
+          continue;
+        }
+
         // If we see a stackified register, prepare to insert subsequent
         // get_locals before the start of its tree.
         if (MFI.isVRegStackified(OldReg)) {
@@ -301,6 +312,15 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
           continue;
         }
 
+        // Our contract with inline asm register operands is to provide local
+        // indices as immediates.
+        if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+          unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+          MRI.removeRegOperandFromUseList(&MO);
+          MO = MachineOperand::CreateImm(LocalId);
+          continue;
+        }
+
         // Insert a get_local.
         unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
         const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index f516a6b260d..e67b1c88b58 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -541,7 +541,7 @@ unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
 unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
   MVT::SimpleValueType From = getSimpleType(V->getType());
   MVT::SimpleValueType To = getLegalType(From);
-  return zeroExtend(getRegForValue(V), V, From, To);
+  return signExtend(getRegForValue(V), V, From, To);
 }
 
 unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index bf326fce88f..4cc7f5ae058 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -16,7 +16,7 @@
 #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
 #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 class MachineFrameInfo;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index df6c937a364..eb74106336e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -17,7 +17,7 @@
 #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
 
 #include "WebAssemblyRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "WebAssemblyGenInstrInfo.inc"
diff --git a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 9367464c806..5e7ebd19fac 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -24,7 +24,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b8ea2f01133..2a784c9822a 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2791,7 +2791,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
                            isParsingIntelSyntax())) {
   default: llvm_unreachable("Unexpected match result!");
   case Match_Success:
-    if (validateInstruction(Inst, Operands))
+    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
       return true;
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the
@@ -3082,7 +3082,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
   // instruction will already have been filled in correctly, since the failing
   // matches won't have modified it).
   if (NumSuccessfulMatches == 1) {
-    if (validateInstruction(Inst, Operands))
+    if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
       return true;
     // Some instructions need post-processing to, for example, tweak which
     // encoding is selected. Loop on it while changes happen so the individual
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 6ff1136cd85..0c99dbbe328 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -54,12 +54,12 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   if (TSFlags & X86II::LOCK)
     OS << "\tlock\t";
   if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
-    OS << "\tlock\n";
+    OS << "\tlock\t";
 
   if (Flags & X86::IP_HAS_REPEAT_NE)
-    OS << "\trepne\n";
+    OS << "\trepne\t";
   else if (Flags & X86::IP_HAS_REPEAT)
-    OS << "\trep\n";
+    OS << "\trep\t";
 
   // Output CALLpcrel32 as "callq" in 64-bit mode.
   // In Intel annotation it's always emitted as "call".
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 464941a1bab..1f02600a798 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -41,13 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   uint64_t TSFlags = Desc.TSFlags;
 
   if (TSFlags & X86II::LOCK)
-    OS << "\tlock\n";
+    OS << "\tlock\t";
 
   unsigned Flags = MI->getFlags();
   if (Flags & X86::IP_HAS_REPEAT_NE)
-    OS << "\trepne\n";
+    OS << "\trepne\t";
   else if (Flags & X86::IP_HAS_REPEAT)
-    OS << "\trep\n";
+    OS << "\trep\t";
 
   printInstruction(MI, OS);
 
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index f4021d7639b..34f2956e0c0 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -116,9 +116,15 @@ def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
 def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
                                       "Enable AVX2 instructions",
                                       [FeatureAVX]>;
+def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
+                                      "Enable three-operand fused multiple-add",
+                                      [FeatureAVX]>;
+def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
+                       "Support 16-bit floating point conversion instructions",
+                       [FeatureAVX]>;
 def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
                                       "Enable AVX-512 instructions",
-                                      [FeatureAVX2]>;
+                                      [FeatureAVX2, FeatureFMA, FeatureF16C]>;
 def FeatureERI      : SubtargetFeature<"avx512er", "HasERI", "true",
                       "Enable AVX-512 Exponential and Reciprocal Instructions",
                                       [FeatureAVX512]>;
@@ -154,9 +160,6 @@ def FeaturePKU   : SubtargetFeature<"pku", "HasPKU", "true",
 def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                                [FeatureSSE2]>;
-def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
-                                      "Enable three-operand fused multiple-add",
-                                      [FeatureAVX]>;
 def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
                                       "Enable four-operand fused multiple-add",
                                       [FeatureAVX, FeatureSSE4A]>;
@@ -177,9 +180,6 @@ def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
                                       "Support MOVBE instruction">;
 def FeatureRDRAND  : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
                                       "Support RDRAND instruction">;
-def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
-                       "Support 16-bit floating point conversion instructions",
-                       [FeatureAVX]>;
 def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
                                        "Support FS/GS Base instructions">;
 def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 34e384ba311..d8b5b7d3fc0 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -34,13 +34,13 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cassert>
 #include <cstddef>
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 7beb9c6e357..54f937bcda3 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -34,6 +34,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
@@ -41,7 +42,6 @@
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/LowLevelTypeImpl.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdint>
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index b2cd622b1e8..e75276960cc 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -57,6 +57,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/MC/MCSchedule.h"
@@ -64,7 +65,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 744510a3a3b..6dd4631a484 100644
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -171,7 +171,7 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
   case X86::VALIGNDZ128rri:
   case X86::VALIGNDZ128rmi:
   case X86::VALIGNQZ128rri:
-  case X86::VALIGNQZ128rmi:
+  case X86::VALIGNQZ128rmi: {
     assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
            "Unexpected new opcode!");
     unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
@@ -180,6 +180,24 @@ static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
     Imm.setImm(Imm.getImm() * Scale);
     break;
   }
+  case X86::VSHUFF32X4Z256rmi:
+  case X86::VSHUFF32X4Z256rri:
+  case X86::VSHUFF64X2Z256rmi:
+  case X86::VSHUFF64X2Z256rri:
+  case X86::VSHUFI32X4Z256rmi:
+  case X86::VSHUFI32X4Z256rri:
+  case X86::VSHUFI64X2Z256rmi:
+  case X86::VSHUFI64X2Z256rri: {
+    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+           "Unexpected new opcode!");
+    MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+    int64_t ImmVal = Imm.getImm();
+    // Set bit 5, move bit 1 to bit 4, copy bit 0.
+    Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+    break;
+  }
+  }
 }
 
 
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index b2b5a78fcdb..9664c931c35 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -55,9 +55,9 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 #define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc..bbc2bffdb70 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -22,9 +22,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
 namespace llvm {
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 5582526541b..7bcbe199124 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -37,11 +37,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 988f2967401..86e65b83ffa 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1562,6 +1562,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   bool HasFP = hasFP(MF);
   uint64_t NumBytes = 0;
 
+  bool NeedsDwarfCFI =
+      (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+       !MF.getTarget().getTargetTriple().isOSWindows()) &&
+      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+
   if (IsFunclet) {
     assert(HasFP && "EH funclets without FP not yet implemented");
     NumBytes = getWinEHFuncletFrameSize(MF);
@@ -1584,6 +1589,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
             MachineFramePtr)
         .setMIFlag(MachineInstr::FrameDestroy);
+    if (NeedsDwarfCFI) {
+      unsigned DwarfStackPtr =
+          TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+                                  nullptr, DwarfStackPtr, -SlotSize));
+      --MBBI;
+    }
   }
 
   MachineBasicBlock::iterator FirstCSPop = MBBI;
@@ -1647,6 +1659,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   } else if (NumBytes) {
     // Adjust stack pointer back: ESP += numbytes.
     emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+    if (!hasFP(MF) && NeedsDwarfCFI) {
+      // Define the current CFA rule to use the provided offset.
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+                                  nullptr, -CSSize - SlotSize));
+    }
     --MBBI;
   }
 
@@ -1659,6 +1676,23 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (NeedsWin64CFI && MF.hasWinCFI())
     BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
 
+  if (!hasFP(MF) && NeedsDwarfCFI) {
+    MBBI = FirstCSPop;
+    int64_t Offset = -CSSize - SlotSize;
+    // Mark callee-saved pop instruction.
+    // Define the current CFA rule to use the provided offset.
+    while (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator PI = MBBI;
+      unsigned Opc = PI->getOpcode();
+      ++MBBI;
+      if (Opc == X86::POP32r || Opc == X86::POP64r) {
+        Offset += SlotSize;
+        BuildCFI(MBB, MBBI, DL,
+                 MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+      }
+    }
+  }
+
   if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
     // Add the return addr area delta back since we are not tail calling.
     int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -2577,6 +2611,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
   unsigned Regs[2];
   unsigned FoundRegs = 0;
 
+  auto &MRI = MBB.getParent()->getRegInfo();
   auto RegMask = Prev->getOperand(1);
 
   auto &RegClass =
@@ -2590,6 +2625,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
     if (!RegMask.clobbersPhysReg(Candidate))
       continue;
 
+    // Don't clobber reserved registers
+    if (MRI.isReserved(Candidate))
+      continue;
+
     bool IsDef = false;
     for (const MachineOperand &MO : Prev->implicit_operands()) {
       if (MO.isReg() && MO.isDef() &&
@@ -2835,6 +2874,15 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
   return MBBI;
 }
 
+int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+  return TRI->getSlotSize();
+}
+
+unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
+    const {
+  return TRI->getDwarfRegNum(StackPtr, true);
+}
+
 namespace {
 // Struct used by orderFrameObjects to help sort the stack objects.
 struct X86FrameSortingObject {
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 38ac96e16d4..2bce79262d0 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 
 namespace llvm {
 
@@ -168,6 +168,10 @@ public:
                               MachineBasicBlock::iterator MBBI,
                               const DebugLoc &DL, bool RestoreSP = false) const;
 
+  int getInitialCFAOffset(const MachineFunction &MF) const override;
+
+  unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+
 private:
   uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
 
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e43fd508de3..0cbf7601790 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -449,15 +449,15 @@ namespace {
 // Returns true if this masked compare can be implemented legally with this
 // type.
 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
-  if (N->getOpcode() == X86ISD::PCMPEQM ||
-      N->getOpcode() == X86ISD::PCMPGTM ||
-      N->getOpcode() == X86ISD::CMPM ||
-      N->getOpcode() == X86ISD::CMPMU) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
+      Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
+      Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU) {
     // We can get 256-bit 8 element types here without VLX being enabled. When
     // this happens we will use 512-bit operations and the mask will not be
     // zero extended.
-    if (N->getOperand(0).getValueType() == MVT::v8i32 ||
-        N->getOperand(0).getValueType() == MVT::v8f32)
+    EVT OpVT = N->getOperand(0).getValueType();
+    if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
       return Subtarget->hasVLX();
 
     return true;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b178ad6c13e..22b4d7997fa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -380,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Special handling for half-precision floating point conversions.
   // If we don't have F16C support, then lower half float conversions
   // into library calls.
-  if (Subtarget.useSoftFloat() ||
-      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
   }
@@ -4998,6 +4997,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   switch (Opcode) {
   default:
     return false;
+  case X86ISD::TESTM:
+  case X86ISD::TESTNM:
   case X86ISD::PCMPEQM:
   case X86ISD::PCMPGTM:
   case X86ISD::CMPM:
@@ -6746,6 +6747,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");
 
+  BitVector UndefElements;
+  SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
   // Attempt to use VBROADCASTM
   // From this paterrn:
   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
@@ -6753,17 +6757,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   //
   // Create (VBROADCASTM v2i1 X)
   if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
-    MVT EltType;
-    unsigned NumElts;
+    MVT EltType = VT.getScalarType();
+    unsigned NumElts = VT.getVectorNumElements();
+    SDValue BOperand;
     SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
-    if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) {
-      SDValue BOperand = ZeroExtended.getOperand(0);
+    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+      if (ZeroExtended)
+        BOperand = ZeroExtended.getOperand(0);
+      else
+        BOperand = Ld.getOperand(0).getOperand(0);
       if (BOperand.getValueType().isVector() &&
           BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
-        if ((EltType == MVT::i64 &&
-             VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q
-            (EltType == MVT::i32 &&
-             VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d
+        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+                                     NumElts == 8)) || // for broadcastmb2q
+            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+                                     NumElts == 16))) { // for broadcastmw2d
           SDValue Brdcst =
               DAG.getNode(X86ISD::VBROADCASTM, dl,
                           MVT::getVectorVT(EltType, NumElts), BOperand);
@@ -6773,9 +6783,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     }
   }
 
-  BitVector UndefElements;
-  SDValue Ld = BVOp->getSplatValue(&UndefElements);
-
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -7707,6 +7714,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+//               (extract_elt V, (extract_elt I, 1)),
+//                    ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
+// when no native operation available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  // Look for VPERMV and PSHUFB opportunities.
+  MVT VT = V.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v16i8:
+    if (!Subtarget.hasSSE3())
+      return SDValue();
+    break;
+  case MVT::v8f32:
+  case MVT::v8i32:
+    if (!Subtarget.hasAVX2())
+      return SDValue();
+    break;
+  case MVT::v4i64:
+  case MVT::v4f64:
+    if (!Subtarget.hasVLX())
+      return SDValue();
+    break;
+  case MVT::v16f32:
+  case MVT::v8f64:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (!Subtarget.hasAVX512())
+      return SDValue();
+    break;
+  case MVT::v32i16:
+    if (!Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v64i8:
+    if (!Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  case MVT::v32i8:
+    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  }
+  SDValue SrcVec, IndicesVec;
+  // Check for a match of the permute source vector and permute index elements.
+  // This is done by checking that the i-th build_vector operand is of the form:
+  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+    SDValue Op = V.getOperand(Idx);
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract encountered in V, set the source vector,
+    // otherwise verify the extract is from the previously defined source
+    // vector.
+    if (!SrcVec)
+      SrcVec = Op.getOperand(0);
+    else if (SrcVec != Op.getOperand(0))
+      return SDValue();
+    SDValue ExtractedIndex = Op->getOperand(1);
+    // Peek through extends.
+    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+      ExtractedIndex = ExtractedIndex.getOperand(0);
+    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract from the index vector candidate, set the
+    // indices vector, otherwise verify the extract is from the previously
+    // defined indices vector.
+    if (!IndicesVec)
+      IndicesVec = ExtractedIndex.getOperand(0);
+    else if (IndicesVec != ExtractedIndex.getOperand(0))
+      return SDValue();
+
+    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+    if (!PermIdx || PermIdx->getZExtValue() != Idx)
+      return SDValue();
+  }
+  MVT IndicesVT = VT;
+  if (VT.isFloatingPoint())
+    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+                                 VT.getVectorNumElements());
+  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+  return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+                     SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -7922,6 +8034,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
+  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+      return V;
+
   // See if we can use a vector load to get all of the elements.
   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
@@ -10716,10 +10831,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // If we have direct support for blends, we should lower by decomposing into
   // a permute. That will be faster than the domain cross.
@@ -11016,10 +11137,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
@@ -12372,6 +12499,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
       }
     }
+
+    // Try to use SHUF128 if possible.
+    if (Subtarget.hasVLX()) {
+      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+                            ((WidenedMask[1] % 2) << 1);
+      return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                         DAG.getConstant(PermMask, DL, MVT::i8));
+      }
+    }
   }
 
   // Otherwise form a 128-bit permutation. After accounting for undefs,
@@ -13697,10 +13834,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  if (SDValue Shuf128 =
-          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
-    return Shuf128;
-
   if (V2.isUndef()) {
     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
     // can use lower latency instructions that will operate on all four
@@ -13722,6 +13855,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
   }
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
@@ -17333,6 +17470,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
 
   if (Swap)
     std::swap(Op0, Op1);
+
+  //  See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
+  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+    SDValue A = peekThroughBitcasts(Op0);
+    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+        ISD::isBuildVectorAllZeros(Op1.getNode())) {
+      MVT VT0 = Op0.getSimpleValueType();
+      SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+      SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+                         dl, VT, RHS, LHS);
+    }
+  }
+
   if (Opc)
     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
@@ -19838,10 +19989,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         PassThru = Src1;
 
-      SDValue Rnd = Op.getOperand(5);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+                                                  Op.getValueType(), Src1, Src2,
+                                                  Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
+
       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                               Op.getValueType(), Src1, Src2,
-                                              Src3, Rnd),
+                                              Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
     case IFMA_OP_MASKZ:
@@ -24786,9 +24946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   case X86ISD::FMINC:              return "X86ISD::FMINC";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
-  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
   case X86ISD::FRCP:               return "X86ISD::FRCP";
-  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
@@ -24942,10 +25100,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1:            return "X86ISD::FMADDS1";
+  case X86ISD::FNMADDS1:           return "X86ISD::FNMADDS1";
+  case X86ISD::FMSUBS1:            return "X86ISD::FMSUBS1";
+  case X86ISD::FNMSUBS1:           return "X86ISD::FNMSUBS1";
   case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
   case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
   case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
   case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3:            return "X86ISD::FMADDS3";
+  case X86ISD::FNMADDS3:           return "X86ISD::FNMADDS3";
+  case X86ISD::FMSUBS3:            return "X86ISD::FMSUBS3";
+  case X86ISD::FNMSUBS3:           return "X86ISD::FNMSUBS3";
   case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
@@ -24966,9 +25132,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SELECT:             return "X86ISD::SELECT";
   case X86ISD::SELECTS:            return "X86ISD::SELECTS";
   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
+  case X86ISD::RCP14:              return "X86ISD::RCP14";
+  case X86ISD::RCP14S:             return "X86ISD::RCP14S";
   case X86ISD::RCP28:              return "X86ISD::RCP28";
   case X86ISD::RCP28S:             return "X86ISD::RCP28S";
   case X86ISD::EXP2:               return "X86ISD::EXP2";
+  case X86ISD::RSQRT14:            return "X86ISD::RSQRT14";
+  case X86ISD::RSQRT14S:           return "X86ISD::RSQRT14S";
   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
@@ -25006,6 +25176,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
   case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
+  case X86ISD::CVTPH2PS_RND:       return "X86ISD::CVTPH2PS_RND";
   case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
   case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
   case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
@@ -30314,10 +30485,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
-  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
     return NewOp;
 
-  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+  // TODO - Remove this once we can handle the implicit zero-extension of
+  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+  // combineBasicSADPattern.
+  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
@@ -30464,16 +30642,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           const X86Subtarget &Subtarget) {
-  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
 /// If a vector select has an operand that is -1 or 0, try to simplify the
 /// select to a bitwise logic operation.
 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -30674,26 +30842,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
 
   unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
-  case X86ISD::PALIGNR:
-    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
-    if (!VT.is128BitVector())
-      return false;
-    Opcode = X86ISD::VALIGN;
-    LLVM_FALLTHROUGH;
-  case X86ISD::VALIGN: {
-    if (EltVT != MVT::i32 && EltVT != MVT::i64)
-      return false;
-    uint64_t Imm = Op.getConstantOperandVal(2);
-    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
-    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
-    unsigned EltSize = EltVT.getSizeInBits();
-    // Make sure we can represent the same shift with the new VT.
-    if ((ShiftAmt % EltSize) != 0)
-      return false;
-    Imm = ShiftAmt / EltSize;
-    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
-                                    DAG.getConstant(Imm, DL, MVT::i8));
-  }
   case X86ISD::SHUF128: {
     if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
       return false;
@@ -34441,8 +34589,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 }
 
-/// This function transforms vector truncation of 'extended sign-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
@@ -34475,10 +34624,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
   unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
-  if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
-    return SDValue();
+  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+  // Use PACKUS if the input has zero-bits that extend all the way to the
+  // packed/truncated value. e.g. masks, zext_in_reg, etc.
+  KnownBits Known;
+  DAG.computeKnownBits(In, Known);
+  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
 
-  return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+  return SDValue();
 }
 
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -34507,7 +34665,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
-  // Try to truncate extended sign bits with PACKSS.
+  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
     return V;
 
@@ -35341,9 +35499,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
 
   // Do not convert the passthru input of scalar intrinsics.
   // FIXME: We could allow negations of the lower element only.
-  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
   bool NegB = invertIfNegative(B);
-  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
@@ -35371,6 +35531,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
     }
+  } else if (N->getOpcode() == X86ISD::FMADDS1) {
+    switch (NewOpcode) {
+    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3) {
+    switch (NewOpcode) {
+    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+    }
   } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
     switch (NewOpcode) {
     case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND; break;
@@ -36590,10 +36764,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::EXTRACT_VECTOR_ELT:
-    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case X86ISD::PEXTRW:
   case X86ISD::PEXTRB:
-    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case ISD::INSERT_SUBVECTOR:
     return combineInsertSubvector(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_SUBVECTOR:
@@ -36689,6 +36862,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMADD_RND:
   case X86ISD::FMADDS1_RND:
   case X86ISD::FMADDS3_RND:
+  case X86ISD::FMADDS1:
+  case X86ISD::FMADDS3:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 17cb976a4c7..d1438e59f9b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -254,7 +254,9 @@ namespace llvm {
       /// Note that these typically require refinement
       /// in order to obtain suitable precision.
       FRSQRT, FRCP,
-      FRSQRTS, FRCPS,
+
+      // AVX-512 reciprocal approximations with a little more precision.
+      RSQRT14, RSQRT14S, RCP14, RCP14S,
 
       // Thread Local Storage.
       TLSADDR,
@@ -487,6 +489,12 @@ namespace llvm {
       FMADDSUB_RND,
       FMSUBADD_RND,
 
+      // Scalar intrinsic FMA.
+      FMADDS1, FMADDS3,
+      FNMADDS1, FNMADDS3,
+      FMSUBS1, FMSUBS3,
+      FNMSUBS1, FNMSUBS3,
+
       // Scalar intrinsic FMA with rounding mode.
       // Two versions, passthru bits on op1 or op3.
       FMADDS1_RND, FMADDS3_RND,
@@ -555,7 +563,7 @@ namespace llvm {
       RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
 
       // Conversions between float and half-float.
-      CVTPS2PH, CVTPH2PS,
+      CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
 
       // LWP insert record.
       LWPINS,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index a73ee19423d..84b44ac677b 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6017,16 +6017,17 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
 }
 
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                            string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
-                            SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+                            string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+                            SDNode OpNodeRnds1, SDNode OpNodes3,
+                            SDNode OpNodeRnds3, X86VectorVTInfo _,
+                            string SUFF> {
   let ExeDomain = _.ExeDomain in {
   defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
                 // Operands for intrinsic are in 123 order to preserve passthu
                 // semantics.
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
-                                   (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
-                         _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+                (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
+                         _.ScalarIntMemCPat:$src3)),
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -6035,10 +6036,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                          (_.ScalarLdFrag addr:$src3)))), 0>;
 
   defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
-                (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
-                                   (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
-                              _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+                (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
+                              _.RC:$src1)),
                 (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
                                   (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@@ -6050,8 +6050,8 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
   // 213 and 231 patterns this helps tablegen's duplicate pattern detection.
   defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
                 (null_frag),
-                (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
-                              _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
+                              _.RC:$src2)),
                 (null_frag),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
                          _.FRC:$src2))),
@@ -6061,26 +6061,29 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
 }
 
 multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
-                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                        string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+                        SDNode OpNodeRnds1, SDNode OpNodes3,
                         SDNode OpNodeRnds3> {
   let Predicates = [HasAVX512] in {
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                 OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+                                 OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+                                 f32x_info, "SS">,
                                  EVEX_CD8<32, CD8VT1>, VEX_LIG;
     defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
-                                 OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+                                 OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+                                 f64x_info, "SD">,
                                  EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
   }
 }
 
-defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
-                            X86FmaddRnds3>;
-defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
-                            X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
-                            X86FnmaddRnds1, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
-                            X86FnmsubRnds1, X86FnmsubRnds3>;
+defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
+                            X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
+defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
+                            X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
+                            X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
+                            X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -6554,7 +6557,7 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
                                           NotMemoryFoldable;
 
 def : Pat<(f64 (fpextend FR32X:$src)),
-          (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
+          (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fpextend (loadf32 addr:$src))),
           (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6569,7 +6572,7 @@ def : Pat<(f64 (extloadf32 addr:$src)),
           Requires<[HasAVX512, OptForSpeed]>;
 
 def : Pat<(f32 (fpround FR64X:$src)),
-          (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
+          (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
            Requires<[HasAVX512]>;
 
 def : Pat<(v4f32 (X86Movss
@@ -7174,34 +7177,44 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
 //===----------------------------------------------------------------------===//
 multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, PatFrag ld_frag> {
-  defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
-                    "vcvtph2ps", "$src", "$src",
-                   (X86cvtph2ps (_src.VT _src.RC:$src),
-                                                (i32 FROUND_CURRENT))>, T8PD;
-  defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
-                    "vcvtph2ps", "$src", "$src",
-                    (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
-                                     (i32 FROUND_CURRENT))>, T8PD;
+  defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+                            (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+                            (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD;
+  defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+                            (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+                            (X86cvtph2ps (_src.VT
+                                          (bitconvert
+                                           (ld_frag addr:$src))))>, T8PD;
 }
 
 multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
-  defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
-                    "vcvtph2ps", "{sae}, $src", "$src, {sae}",
-                   (X86cvtph2ps (_src.VT _src.RC:$src),
-                                                (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+  defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+                            (ins _src.RC:$src), "vcvtph2ps",
+                            "{sae}, $src", "$src, {sae}",
+                            (X86cvtph2psRnd (_src.VT _src.RC:$src),
+                                            (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
 
 }
 
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512] in
   defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
-  let Predicates = [HasVLX] in {
-    defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                         loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
-    defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                         loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
-  }
+
+let Predicates = [HasVLX] in {
+  defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+                       loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+  defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+                       loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (VCVTPH2PSZ128rm addr:$src)>;
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+            (VCVTPH2PSZ128rm addr:$src)>;
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VCVTPH2PSZ128rm addr:$src)>;
 }
 
 multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
@@ -7212,17 +7225,16 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2)),
                    NoItinerary, 0, 0>, AVX512AIi8Base;
-  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
-             (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-             [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                     (i32 imm:$src2))),
-                                     addr:$dst)]>;
-  let hasSideEffects = 0, mayStore = 1 in
-  def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
-             (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
-             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-              []>, EVEX_K;
+  let hasSideEffects = 0, mayStore = 1 in {
+    def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               []>;
+    def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                []>, EVEX_K;
+  }
 }
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
   let hasSideEffects = 0 in
@@ -7242,6 +7254,19 @@ let Predicates = [HasAVX512] in {
     defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
                         EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
+
+  def : Pat<(store (f64 (extractelt
+                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+  def : Pat<(store (i64 (extractelt
+                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+  def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
+            (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
+  def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
+            (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
 }
 
 // Patterns for matching conversions from float to half-float and vice versa.
@@ -7264,35 +7289,6 @@ let Predicates = [HasVLX] in {
               (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
 }
 
-// Patterns for matching float to half-float conversion when AVX512 is supported
-// but F16C isn't. In that case we have to use 512-bit vectors.
-let Predicates = [HasAVX512, NoVLX, NoF16C] in {
-  def : Pat<(fp_to_f16 FR32X:$src),
-            (i16 (EXTRACT_SUBREG
-                  (VMOVPDI2DIZrr
-                   (v8i16 (EXTRACT_SUBREG
-                    (VCVTPS2PHZrr
-                     (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                      (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
-                      sub_xmm), 4), sub_xmm))), sub_16bit))>;
-
-  def : Pat<(f16_to_fp GR16:$src),
-            (f32 (COPY_TO_REGCLASS
-                  (v4f32 (EXTRACT_SUBREG
-                   (VCVTPH2PSZrr
-                    (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
-                     (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
-                     sub_xmm)), sub_xmm)), FR32X))>;
-
-  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
-            (f32 (COPY_TO_REGCLASS
-                  (v4f32 (EXTRACT_SUBREG
-                          (VCVTPH2PSZrr
-                           (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                            (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
-                            sub_xmm), 4)), sub_xmm)), FR32X))>;
-}
-
 //  Unordered/Ordered scalar fp compare with Sea and set EFLAGS
 multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
                             string OpcodeStr> {
@@ -7362,13 +7358,13 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
 }
 }
 
-defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
                   EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
                   VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
                   EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
-defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
                   VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
 
 /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
@@ -7414,8 +7410,8 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
   }
 }
 
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
 
 /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
 multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
@@ -7582,7 +7578,8 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
-                              string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+                              string SUFF, SDNode OpNode, SDNode OpNodeRnd,
+                              Intrinsic Intr> {
   let ExeDomain = _.ExeDomain in {
   defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -7618,21 +7615,35 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
   }
   }
 
+let Predicates = [HasAVX512] in {
   def : Pat<(_.EltVT (OpNode _.FRC:$src)),
             (!cast<Instruction>(NAME#SUFF#Zr)
                 (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
 
+   def : Pat<(Intr VR128X:$src),
+             (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
+                                 VR128X:$src)>;
+}
+
+let Predicates = [HasAVX512, OptForSize] in {
   def : Pat<(_.EltVT (OpNode (load addr:$src))),
             (!cast<Instruction>(NAME#SUFF#Zm)
-                (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+                (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+
+  def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
+            (!cast<Instruction>(NAME#SUFF#Zm_Int)
+                  (_.VT (IMPLICIT_DEF)), addr:$src2)>;
+}
+
 }
 
 multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
   defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
-                        X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS,
-                        NotMemoryFoldable;
+                        X86fsqrtRnds, int_x86_sse_sqrt_ss>,
+                        EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
   defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
-                        X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
+                        X86fsqrtRnds, int_x86_sse2_sqrt_sd>,
+                        EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
                         NotMemoryFoldable;
 }
 
@@ -7641,19 +7652,6 @@ defm VSQRT   : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
 
 defm VSQRT   : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
 
-let Predicates = [HasAVX512] in {
-  def : Pat<(f32 (X86frsqrt FR32X:$src)),
-            (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
-  def : Pat<(f32 (X86frsqrt (load addr:$src))),
-            (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-  def : Pat<(f32 (X86frcp FR32X:$src)),
-            (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
-  def : Pat<(f32 (X86frcp (load addr:$src))),
-            (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
-            Requires<[OptForSize]>;
-}
-
 multiclass
 avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
 
@@ -8911,6 +8909,123 @@ defm VPALIGNR:   avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
                                           avx512vl_i8_info, avx512vl_i8_info>,
                 EVEX_CD8<8, CD8VF>;
 
+// Fragments to help convert valignq into masked valignd. Or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+  return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+                                        X86VectorVTInfo From, X86VectorVTInfo To,
+                                        SDNodeXForm ImmXForm> {
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, To.RC:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+                                              imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, To.RC:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                      (bitconvert (To.LdFrag addr:$src2)),
+                                      imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+                                                  To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                      (bitconvert (To.LdFrag addr:$src2)),
+                                      imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+                                           X86VectorVTInfo From,
+                                           X86VectorVTInfo To,
+                                           SDNodeXForm ImmXForm> :
+      avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+  def : Pat<(From.VT (OpNode From.RC:$src1,
+                             (bitconvert (To.VT (X86VBroadcast
+                                                (To.ScalarLdFrag addr:$src2)))),
+                             imm:$src3)),
+            (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+                                                  (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                      (bitconvert
+                                       (To.VT (X86VBroadcast
+                                               (To.ScalarLdFrag addr:$src2)))),
+                                      imm:$src3))),
+                            To.RC:$src0)),
+            (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+                                                   To.RC:$src1, addr:$src2,
+                                                   (ImmXForm imm:$src3))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (bitconvert
+                             (From.VT (OpNode From.RC:$src1,
+                                      (bitconvert
+                                       (To.VT (X86VBroadcast
+                                               (To.ScalarLdFrag addr:$src2)))),
+                                      imm:$src3))),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+                                                    To.RC:$src1, addr:$src2,
+                                                    (ImmXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX512] in {
+  // For 512-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+                                         v16i32_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX] in {
+  // For 128-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+                                         v4i32x_info, ValignqImm32XForm>;
+  // For 256-bit we lower to the widest element type we can. So we only need
+  // to handle converting valignq to valignd.
+  defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+                                         v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR.
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+                                      v16i8x_info, ValignqImm8XForm>;
+  defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+                                      v16i8x_info, ValigndImm8XForm>;
+}
+
 defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
                     avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
 
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 453dcd83df1..15466c2978f 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -290,8 +290,7 @@ multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
 }
 
 multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
-                 string OpStr, Intrinsic IntF32, Intrinsic IntF64,
-                 SDNode OpNode> {
+                 string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
   let ExeDomain = SSEPackedSingle in
   defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
                           FR32, f32mem>,
@@ -309,43 +308,44 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
   // This is because src1 is tied to dest, and the scalar intrinsics
   // require the pass-through values to come from the first source
   // operand, not the second.
-  // TODO: Use AVX512 instructions when possible.
-  let Predicates = [HasFMA] in {
-    def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+  let Predicates = [HasFMA, NoAVX512] in {
+    def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
               (!cast<Instruction>(NAME#"213SSr_Int")
                VR128:$src1, VR128:$src2, VR128:$src3)>;
 
-    def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+    def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
               (!cast<Instruction>(NAME#"213SDr_Int")
                VR128:$src1, VR128:$src2, VR128:$src3)>;
 
-    def : Pat<(IntF32 VR128:$src1, VR128:$src2, sse_load_f32:$src3),
+    def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
+                                   sse_load_f32:$src3)),
               (!cast<Instruction>(NAME#"213SSm_Int")
                VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
 
-    def : Pat<(IntF64 VR128:$src1, VR128:$src2, sse_load_f64:$src3),
+    def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
+                                   sse_load_f64:$src3)),
               (!cast<Instruction>(NAME#"213SDm_Int")
                VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
 
-    def : Pat<(IntF32 VR128:$src1, sse_load_f32:$src3, VR128:$src2),
+    def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
+                                   VR128:$src2)),
               (!cast<Instruction>(NAME#"132SSm_Int")
                VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
 
-    def : Pat<(IntF64 VR128:$src1, sse_load_f64:$src3, VR128:$src2),
+    def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
+                                   VR128:$src2)),
               (!cast<Instruction>(NAME#"132SDm_Int")
                VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
   }
 }
 
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
-                    int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
-                    int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
 
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
-                     int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
-                     int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
+                     VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
+                     VEX_LIG;
 
 
 //===----------------------------------------------------------------------===//
@@ -385,26 +385,28 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
 }
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
-                     ComplexPattern mem_cpat, Intrinsic Int> {
+                     ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
 let isCodeGenOnly = 1 in {
   def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+                 (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
+               VEX_LIG;
   def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, memop:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
-                                  mem_cpat:$src3))]>, VEX_W, VEX_LIG;
+               [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
   def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, memop:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+                 (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
+               VEX_LIG;
 let hasSideEffects = 0 in
   def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
@@ -475,19 +477,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
 let ExeDomain = SSEPackedSingle in {
   // Scalar Instructions
   defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
-                    fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
-                              int_x86_fma_vfmadd_ss>;
+                    fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
+                              X86Fmadds1>;
   defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
-                    fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
-                              int_x86_fma_vfmsub_ss>;
+                    fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
+                              X86Fmsubs1>;
   defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
                           X86Fnmadd, loadf32>,
-                    fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
-                              int_x86_fma_vfnmadd_ss>;
+                    fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
+                              X86Fnmadds1>;
   defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
                           X86Fnmsub, loadf32>,
-                    fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
-                              int_x86_fma_vfnmsub_ss>;
+                    fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
+                              X86Fnmsubs1>;
   // Packed Instructions
   defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
                             loadv4f32, loadv8f32>;
@@ -506,19 +508,19 @@ let ExeDomain = SSEPackedSingle in {
 let ExeDomain = SSEPackedDouble in {
   // Scalar Instructions
   defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
-                    fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
-                              int_x86_fma_vfmadd_sd>;
+                    fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
+                              X86Fmadds1>;
   defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
-                    fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
-                              int_x86_fma_vfmsub_sd>;
+                    fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
+                              X86Fmsubs1>;
   defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
                           X86Fnmadd, loadf64>,
-                    fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
-                              int_x86_fma_vfnmadd_sd>;
+                    fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
+                              X86Fnmadds1>;
   defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
                           X86Fnmsub, loadf64>,
-                    fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
-                              int_x86_fma_vfnmsub_sd>;
+                    fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
+                              X86Fnmsubs1>;
   // Packed Instructions
   defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
                             loadv2f64, loadv4f64>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c4f34bdd37e..d30400836bb 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -56,8 +56,6 @@ def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
 def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp>;
 def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS",  SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCPS",    SDTFPBinOp>;
 def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
 def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
 def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
@@ -482,11 +480,22 @@ def X86FmaddsubRnd  : SDNode<"X86ISD::FMADDSUB_RND",  SDTFmaRound, [SDNPCommutat
 def X86FmsubaddRnd  : SDNode<"X86ISD::FMSUBADD_RND",  SDTFmaRound, [SDNPCommutative]>;
 
 // Scalar FMA intrinsics with passthru bits in operand 1.
+def X86Fmadds1  : SDNode<"X86ISD::FMADDS1",  SDTFPTernaryOp>;
+def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
+def X86Fmsubs1  : SDNode<"X86ISD::FMSUBS1",  SDTFPTernaryOp>;
+def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
+
+// Scalar FMA intrinsics with passthru bits in operand 1.
 def X86FmaddRnds1   : SDNode<"X86ISD::FMADDS1_RND",     SDTFmaRound>;
 def X86FnmaddRnds1  : SDNode<"X86ISD::FNMADDS1_RND",    SDTFmaRound>;
 def X86FmsubRnds1   : SDNode<"X86ISD::FMSUBS1_RND",     SDTFmaRound>;
 def X86FnmsubRnds1  : SDNode<"X86ISD::FNMSUBS1_RND",    SDTFmaRound>;
 
+def X86Fmadds3  : SDNode<"X86ISD::FMADDS3",  SDTFPTernaryOp>;
+def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp>;
+def X86Fmsubs3  : SDNode<"X86ISD::FMSUBS3",  SDTFPTernaryOp>;
+def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp>;
+
 // Scalar FMA intrinsics with passthru bits in operand 3.
 def X86FmaddRnds3   : SDNode<"X86ISD::FMADDS3_RND",     SDTFmaRound, [SDNPCommutative]>;
 def X86FnmaddRnds3  : SDNode<"X86ISD::FNMADDS3_RND",    SDTFmaRound, [SDNPCommutative]>;
@@ -498,10 +507,14 @@ def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
 def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTIFma, [SDNPCommutative]>;
 def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTIFma, [SDNPCommutative]>;
 
+def X86rsqrt14   : SDNode<"X86ISD::RSQRT14",  SDTFPUnaryOp>;
+def X86rcp14     : SDNode<"X86ISD::RCP14",    SDTFPUnaryOp>;
 def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  SDTFPUnaryOpRound>;
 def X86rcp28     : SDNode<"X86ISD::RCP28",    SDTFPUnaryOpRound>;
 def X86exp2      : SDNode<"X86ISD::EXP2",     SDTFPUnaryOpRound>;
 
+def X86rsqrt14s  : SDNode<"X86ISD::RSQRT14S",   SDTFPBinOp>;
+def X86rcp14s    : SDNode<"X86ISD::RCP14S",     SDTFPBinOp>;
 def X86rsqrt28s  : SDNode<"X86ISD::RSQRT28S",   SDTFPBinOpRound>;
 def X86rcp28s    : SDNode<"X86ISD::RCP28S",     SDTFPBinOpRound>;
 def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
@@ -578,7 +591,12 @@ def X86VUintToFP      : SDNode<"X86ISD::CVTUI2P",  SDTVintToFP>;
 def X86cvtp2Int      : SDNode<"X86ISD::CVTP2SI",  SDTFloatToInt>;
 def X86cvtp2UInt     : SDNode<"X86ISD::CVTP2UI",  SDTFloatToInt>;
 
+
 def X86cvtph2ps     : SDNode<"X86ISD::CVTPH2PS",
+                              SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+                                                   SDTCVecEltisVT<1, i16>]> >;
+
+def X86cvtph2psRnd  : SDNode<"X86ISD::CVTPH2PS_RND",
                               SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
                                                    SDTCVecEltisVT<1, i16>,
                                                    SDTCisVT<2, i32>]> >;
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index e665ec1f14d..02a09c340ce 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -18,7 +18,7 @@
 #include "X86InstrFMA3Info.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 559a8fcf107..f00caa130d0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -850,7 +850,6 @@ def HasLWP       : Predicate<"Subtarget->hasLWP()">;
 def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
 def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
 def HasF16C      : Predicate<"Subtarget->hasF16C()">;
-def NoF16C       : Predicate<"!Subtarget->hasF16C()">;
 def HasFSGSBase  : Predicate<"Subtarget->hasFSGSBase()">;
 def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
 def HasBMI       : Predicate<"Subtarget->hasBMI()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 451303054f5..955a40ee171 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1514,13 +1514,12 @@ let mayLoad = 1 in
 def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                        (ins FR32:$src1, f64mem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      [], IIC_SSE_CVT_Scalar_RM>,
-                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+                      [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
 }
 
 def : Pat<(f32 (fpround FR64:$src)),
-            (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
+            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
           Requires<[UseAVX]>;
 
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1574,20 +1573,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR64:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [], IIC_SSE_CVT_Scalar_RR>,
-                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+                    [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
                     Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
 let mayLoad = 1 in
 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                     (ins FR64:$src1, f32mem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [], IIC_SSE_CVT_Scalar_RM>,
-                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+                    [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
                     Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
 }
 
 def : Pat<(f64 (fpextend FR32:$src)),
-    (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
+    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
 def : Pat<(fpextend (loadf32 addr:$src)),
     (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
 
@@ -1899,7 +1896,7 @@ let Predicates = [HasAVX, NoVLX] in {
                                  (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
               (VCVTTPD2DQrm addr:$src)>;
   }
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
 
 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -3095,7 +3092,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           ValueType vt, ValueType ScalarVT,
                           X86MemOperand x86memop,
                           Intrinsic Intr, SDNode OpNode, Domain d,
-                          OpndItins itins, string Suffix> {
+                          OpndItins itins, Predicate target, string Suffix> {
   let hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3126,21 +3123,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   // vrcpss mem, %xmm0, %xmm0
   // TODO: In theory, we could fold the load, and avoid the stall caused by
   // the partial register store, either in ExecutionDepsFix or with smarter RA.
-  let Predicates = [UseAVX] in {
+  let Predicates = [target] in {
    def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
                                 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
-  }
-  let Predicates = [HasAVX] in {
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
                                  VR128:$src)>;
   }
-  let Predicates = [HasAVX, OptForSize] in {
+  let Predicates = [target, OptForSize] in {
     def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
               (!cast<Instruction>("V"#NAME#Suffix##m_Int)
                     (vt (IMPLICIT_DEF)), addr:$src2)>;
-  }
-  let Predicates = [UseAVX, OptForSize] in {
     def : Pat<(ScalarVT (OpNode (load addr:$src))),
               (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
             addr:$src)>;
@@ -3186,7 +3179,7 @@ let Predicates = prds in {
 /// sse2_fp_unop_p - SSE2 unops in vector forms.
 multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                           SDNode OpNode, OpndItins itins> {
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
@@ -3220,41 +3213,41 @@ let Predicates = [HasAVX] in {
 }
 
 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          OpndItins itins> {
+                          OpndItins itins, Predicate AVXTarget> {
   defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
                       SSEPackedSingle, itins, UseSSE1, "SS">, XS;
   defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                       f32mem,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG,
-                      NotMemoryFoldable;
+                      SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
+                      VEX_LIG, VEX_WIG, NotMemoryFoldable;
 }
 
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          OpndItins itins> {
+                          OpndItins itins, Predicate AVXTarget> {
   defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                          OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
   defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
                          f64mem,
                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
-                         OpNode, SSEPackedDouble, itins, "SD">,
+                         OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
                          XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
 }
 
 // Square root.
-defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
-             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
-             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
+             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
+             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
              sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
-defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
-             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
+defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
+             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
 
 // There is no f64 version of the reciprocal approximation instructions.
 
@@ -7692,22 +7685,24 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
 //===----------------------------------------------------------------------===//
 // Half precision conversion instructions
 //===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
   def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set RC:$dst, (Int VR128:$src))]>,
+             [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
              T8PD, VEX, Sched<[WriteCvtF2F]>;
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
-             Sched<[WriteCvtF2FLd]>;
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
+                                          (loadv2i64 addr:$src))))]>,
+             T8PD, VEX, Sched<[WriteCvtF2FLd]>;
 }
 
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
   def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
                (ins RC:$src1, i32u8imm:$src2),
                "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+               [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
                TAPD, VEX, Sched<[WriteCvtF2F]>;
   let hasSideEffects = 0, mayStore = 1,
       SchedRW = [WriteCvtF2FLd, WriteRMW] in
@@ -7717,32 +7712,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
                TAPD, VEX;
 }
 
-let Predicates = [HasF16C] in {
-  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
-  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
-  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
-  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+let Predicates = [HasF16C, NoVLX] in {
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem>;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
+  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem>;
+  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
-  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
             (VCVTPH2PSrm addr:$src)>;
-  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
             (VCVTPH2PSrm addr:$src)>;
-  def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
-              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+  def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
             (VCVTPH2PSrm addr:$src)>;
 
-  def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
-                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
-                   addr:$dst),
-                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
-  def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
-                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
-                   addr:$dst),
-                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
-  def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
-                   addr:$dst),
-                   (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+  def : Pat<(store (f64 (extractelt
+                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+  def : Pat<(store (i64 (extractelt
+                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+                         (iPTR 0))), addr:$dst),
+            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
+            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
 }
 
 // Patterns for  matching conversions from float to half-float and vice versa.
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 6f5e41bcdc6..9edac22d5ba 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -422,12 +422,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
-  X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
   X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -1077,12 +1071,12 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
                      X86ISD::FSUBS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FSUBS_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
                      X86ISD::CVTPH2PS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
-                     X86ISD::CVTPH2PS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
                      X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
   X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
                      X86ISD::CVTPS2PH, 0),
   X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
@@ -1098,8 +1092,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1220,8 +1214,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1239,8 +1233,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
                      X86ISD::FMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
 
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1259,8 +1253,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
                      X86ISD::FNMSUB_RND),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
                      X86ISD::VFIXUPIMM, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1298,8 +1292,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
                      X86ISD::FMADD_RND),
 
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1428,26 +1422,26 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
   X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
   X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
-  X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
-  X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
@@ -1468,6 +1462,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfmadd_pd_256,    INTR_TYPE_3OP, ISD::FMA, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_ps,        INTR_TYPE_3OP, ISD::FMA, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_ps_256,    INTR_TYPE_3OP, ISD::FMA, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_sd,        INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_ss,        INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_pd,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmaddsub_ps,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
@@ -1476,6 +1472,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfmsub_pd_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmsub_ps,        INTR_TYPE_3OP, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfmsub_ps_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_sd,        INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_ss,        INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
   X86_INTRINSIC_DATA(fma_vfmsubadd_pd,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(fma_vfmsubadd_ps,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
@@ -1484,10 +1482,14 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfnmadd_pd_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
   X86_INTRINSIC_DATA(fma_vfnmadd_ps,       INTR_TYPE_3OP, X86ISD::FNMADD, 0),
   X86_INTRINSIC_DATA(fma_vfnmadd_ps_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_sd,       INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_ss,       INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_pd,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_pd_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_ps,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_ps_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_sd,       INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_ss,       INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
@@ -1584,6 +1586,10 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
   X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(vcvtph2ps_128,     INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(vcvtph2ps_256,     INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(vcvtps2ph_128,     INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(vcvtps2ph_256,     INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
   X86_INTRINSIC_DATA(xop_vpcomb,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
   X86_INTRINSIC_DATA(xop_vpcomd,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
   X86_INTRINSIC_DATA(xop_vpcomq,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 98b4863134e..b1438bf7bc0 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -22,6 +22,38 @@
 using namespace llvm;
 using namespace TargetOpcode;
 
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirly possible as only legalizing the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+                                const LegalizerInfo::SizeAndActionsVec &v) {
+  for (unsigned i = 0; i < v.size(); ++i) {
+    result.push_back(v[i]);
+    if (i + 1 < v[i].first && i + 1 < v.size() &&
+        v[i + 1].first != v[i].first + 1)
+      result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+  }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+  assert(v.size() >= 1);
+  assert(v[0].first > 1);
+  LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+                                             {2, LegalizerInfo::Unsupported}};
+  addAndInterleaveWithUnsupported(result, v);
+  auto Largest = result.back().first;
+  result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+  return result;
+}
+
 X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
                                    const X86TargetMachine &TM)
     : Subtarget(STI), TM(TM) {
@@ -37,6 +69,17 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   setLegalizerInfoAVX512DQ();
   setLegalizerInfoAVX512BW();
 
+  setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+  for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+    setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+  for (unsigned MemOp : {G_LOAD, G_STORE})
+    setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+       narrowToSmallerAndWidenToSmallest);
+  setLegalizeScalarToDifferentSizeStrategy(
+      G_GEP, 1, widenToLargerTypesUnsupportedOtherwise);
+  setLegalizeScalarToDifferentSizeStrategy(
+      G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+
   computeTables();
 }
 
@@ -47,7 +90,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   const LLT s8 = LLT::scalar(8);
   const LLT s16 = LLT::scalar(16);
   const LLT s32 = LLT::scalar(32);
-  const LLT s64 = LLT::scalar(64);
 
   for (auto Ty : {p0, s1, s8, s16, s32})
     setAction({G_IMPLICIT_DEF, Ty}, Legal);
@@ -55,15 +97,10 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   for (auto Ty : {s8, s16, s32, p0})
     setAction({G_PHI, Ty}, Legal);
 
-  setAction({G_PHI, s1}, WidenScalar);
-
-  for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
+  for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
     for (auto Ty : {s8, s16, s32})
       setAction({BinOp, Ty}, Legal);
 
-    setAction({BinOp, s1}, WidenScalar);
-  }
-
   for (unsigned Op : {G_UADDE}) {
     setAction({Op, s32}, Legal);
     setAction({Op, 1, s1}, Legal);
@@ -73,7 +110,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
     for (auto Ty : {s8, s16, s32, p0})
       setAction({MemOp, Ty}, Legal);
 
-    setAction({MemOp, s1}, WidenScalar);
     // And everything's fine in addrspace 0.
     setAction({MemOp, 1, p0}, Legal);
   }
@@ -85,9 +121,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   setAction({G_GEP, p0}, Legal);
   setAction({G_GEP, 1, s32}, Legal);
 
-  for (auto Ty : {s1, s8, s16})
-    setAction({G_GEP, 1, Ty}, WidenScalar);
-
   // Control-flow
   setAction({G_BRCOND, s1}, Legal);
 
@@ -95,9 +128,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
   for (auto Ty : {s8, s16, s32, p0})
     setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
 
-  setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
-  setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
-
   // Extensions
   for (auto Ty : {s8, s16, s32}) {
     setAction({G_ZEXT, Ty}, Legal);
@@ -105,12 +135,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
     setAction({G_ANYEXT, Ty}, Legal);
   }
 
-  for (auto Ty : {s1, s8, s16}) {
-    setAction({G_ZEXT, 1, Ty}, Legal);
-    setAction({G_SEXT, 1, Ty}, Legal);
-    setAction({G_ANYEXT, 1, Ty}, Legal);
-  }
-
   // Comparison
   setAction({G_ICMP, s1}, Legal);
 
@@ -123,7 +147,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
   if (!Subtarget.is64Bit())
     return;
 
-  const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
 
   setAction({G_IMPLICIT_DEF, s64}, Legal);
@@ -145,7 +168,6 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
   // Extensions
   for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
     setAction({extOp, s64}, Legal);
-    setAction({extOp, 1, s32}, Legal);
   }
 
   // Comparison
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 0dd13077c37..67d95c2233d 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -14,8 +14,8 @@
 
 #include "X86MacroFusion.h"
 #include "X86Subtarget.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 3069d1fd349..9b7732c1db8 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -23,10 +23,10 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f49650340e..efa0cd2c6bc 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,14 +27,14 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b0ce1335bd3..66fea1e688f 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -161,6 +161,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
       // In Regcall calling convention those registers are used for passing
       // parameters. Thus we need to prevent lazy binding in Regcall.
       return X86II::MO_GOTPCREL;
+    if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
+      return X86II::MO_GOTPCREL;
     return X86II::MO_PLT;
   }
 
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a8d7f290688..a21d068c7f4 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -463,7 +463,7 @@ public:
   bool hasPCLMUL() const { return HasPCLMUL; }
   // Prefer FMA4 to FMA - its better for commutation/memory folding and
   // has equal or better performance on all supported targets.
-  bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
+  bool hasFMA() const { return HasFMA && !HasFMA4; }
   bool hasFMA4() const { return HasFMA4; }
   bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
   bool hasXOP() const { return HasXOP; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6e6c724eb0a..11fe84f162d 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -436,4 +436,11 @@ void X86PassConfig::addPreEmitPass() {
     addPass(createX86FixupLEAs());
     addPass(createX86EvexToVexInsts());
   }
+
+  // Verify basic block incoming and outgoing cfa offset and register values and
+  // correct CFA calculation rule where needed by inserting appropriate CFI
+  // instructions.
+  const Triple &TT = TM->getTargetTriple();
+  if (!TT.isOSDarwin() && !TT.isOSWindows())
+    addPass(createCFIInstrInserter());
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index effbd07fa31..6772d96c799 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1437,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       return Entry->Cost;
   }
 
-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
 
 int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
@@ -2644,12 +2644,15 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
     { 3, MVT::v8i8,  9 },  //(load 24i8 and) deinterleave into 3 x 8i8
     { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
     { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
+    { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
 
     { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
     { 4, MVT::v4i8,  4 },  //(load 16i8 and)  deinterleave into 4 x 4i8
     { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
     { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
-    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
+    { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+    { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
   };
 
   static const CostTblEntry AVX2InterleavedStoreTbl[] = {
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index fb8c2a71c9a..ba01f1e25ba 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -26,13 +26,13 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cassert>
 
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index fc08f1582ad..8a186e94d9c 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -25,9 +25,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 27584f4e2b6..e98e9cda11d 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -15,7 +15,7 @@
 #ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
 #define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
 
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index a377784caf4..9d9ee33ce22 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -15,7 +15,7 @@
 #define LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
 
 #include "XCoreRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "XCoreGenInstrInfo.inc"
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index d34e928b14f..a6cf6837009 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -30,7 +30,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 12090bff381..4bb2984e3b4 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -448,9 +448,13 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
   for (auto *GVE : GVs) {
     DIVariable *Var = GVE->getVariable();
     DIExpression *Expr = GVE->getExpression();
-    if (NumElements > 1)
-      Expr = DIExpression::createFragmentExpression(Expr, FragmentOffsetInBits,
-                                                    FragmentSizeInBits);
+    if (NumElements > 1) {
+      if (auto E = DIExpression::createFragmentExpression(
+              Expr, FragmentOffsetInBits, FragmentSizeInBits))
+        Expr = *E;
+      else
+        return;
+    }
     auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
     NGV->addDebugInfo(NGVE);
   }
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 9fa5ed9ab2b..6cef866b7b8 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1401,7 +1401,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
       FAlias->takeName(F);
       if (FAlias->hasName())
         F->setName(FAlias->getName() + ".cfi");
-      F->replaceAllUsesWith(FAlias);
+      F->replaceUsesExceptBlockAddr(FAlias);
     }
     if (!F->isDeclarationForLinker())
       F->setLinkage(GlobalValue::InternalLinkage);
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index b5267f75e41..c47d8b78df3 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -931,15 +931,17 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
       continue;
 
-    ORE.emit([&]() {
-      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined",
-                                CS.getInstruction())
-             << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
-             << ore::NV("Caller", CS.getCaller());
-    });
+    // Construct remark before doing the inlining, as after successful inlining
+    // the callsite is removed.
+    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
+    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+       << ore::NV("Caller", CS.getCaller());
 
     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
-    InlineFunction(CS, IFI);
+    if (!InlineFunction(CS, IFI))
+      continue;
+
+    ORE.emit(OR);
 
     // Now update the entry count:
     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 828eb5eee29..5d373665509 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -467,6 +467,9 @@ void PassManagerBuilder::populateModulePassManager(
 
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
+  if (OptLevel > 2)
+    MPM.add(createCallSiteSplittingPass());
+
   MPM.add(createIPSCCPPass());          // IP SCCP
   MPM.add(createCalledValuePropagationPass());
   MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
@@ -545,6 +548,9 @@ void PassManagerBuilder::populateModulePassManager(
   // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
   // during ThinLTO and perform the rest of the optimizations afterward.
   if (PrepareForThinLTO) {
+    // Ensure we perform any last passes, but do so before renaming anonymous
+    // globals in case the passes add any.
+    addExtensionsToPM(EP_OptimizerLast, MPM);
     // Rename anon globals to be able to export them in the summary.
     MPM.add(createNameAnonGlobalPass());
     return;
@@ -703,6 +709,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createInferFunctionAttrsLegacyPass());
 
   if (OptLevel > 1) {
+    // Split call-site with more constrained arguments.
+    PM.add(createCallSiteSplittingPass());
+
     // Indirect call promotion. This should promote all the targets that are
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 34414f96cca..8930e9b2b95 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -1182,24 +1182,20 @@ void SampleProfileLoader::buildEdges(Function &F) {
   }
 }
 
-/// Sorts the CallTargetMap \p M by count in descending order and stores the
-/// sorted result in \p Sorted. Returns the total counts.
-static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
-                                const SampleRecord::CallTargetMap &M) {
-  Sorted.clear();
-  uint64_t Sum = 0;
-  for (auto I = M.begin(); I != M.end(); ++I) {
-    Sum += I->getValue();
-    Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
-  }
-  std::sort(Sorted.begin(), Sorted.end(),
+/// Returns the sorted CallTargetMap \p M by count in descending order.
+static SmallVector<InstrProfValueData, 2> SortCallTargets(
+    const SampleRecord::CallTargetMap &M) {
+  SmallVector<InstrProfValueData, 2> R;
+  for (auto I = M.begin(); I != M.end(); ++I)
+    R.push_back({Function::getGUID(I->getKey()), I->getValue()});
+  std::sort(R.begin(), R.end(),
             [](const InstrProfValueData &L, const InstrProfValueData &R) {
               if (L.Count == R.Count)
                 return L.Value > R.Value;
               else
                 return L.Count > R.Count;
             });
-  return Sum;
+  return R;
 }
 
 /// \brief Propagate weights into edges
@@ -1292,8 +1288,10 @@ void SampleProfileLoader::propagateWeights(Function &F) {
           auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
           if (!T || T.get().empty())
             continue;
-          SmallVector<InstrProfValueData, 2> SortedCallTargets;
-          uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+          SmallVector<InstrProfValueData, 2> SortedCallTargets =
+              SortCallTargets(T.get());
+          uint64_t Sum;
+          findIndirectCallFunctionSamples(I, Sum);
           annotateValueSite(*I.getParent()->getParent()->getParent(), I,
                             SortedCallTargets, Sum, IPVK_IndirectCallTarget,
                             SortedCallTargets.size());
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 18b246b5d99..d28d615f47e 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -482,7 +482,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
     return nullptr;
 
   FastMathFlags Flags;
-  Flags.setUnsafeAlgebra();
+  Flags.setFast();
   if (I0) Flags &= I->getFastMathFlags();
   if (I1) Flags &= I->getFastMathFlags();
 
@@ -511,7 +511,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
 }
 
 Value *FAddCombine::simplify(Instruction *I) {
-  assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+  assert(I->isFast() && "Expected 'fast' instruction");
 
   // Currently we are not able to handle vector type.
   if (I->getType()->isVectorTy())
@@ -1386,7 +1386,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
   if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
     return replaceInstUsesWith(I, V);
 
-  if (I.hasUnsafeAlgebra()) {
+  if (I.isFast()) {
     if (Value *V = FAddCombine(Builder).simplify(&I))
       return replaceInstUsesWith(I, V);
   }
@@ -1736,7 +1736,7 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
   if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
     return replaceInstUsesWith(I, V);
 
-  if (I.hasUnsafeAlgebra()) {
+  if (I.isFast()) {
     if (Value *V = FAddCombine(Builder).simplify(&I))
       return replaceInstUsesWith(I, V);
   }
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 7a4abc9aca0..a00e6f73ab8 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2017,7 +2017,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   }
   case Intrinsic::fmuladd: {
     // Canonicalize fast fmuladd to the separate fmul + fadd.
-    if (II->hasUnsafeAlgebra()) {
+    if (II->isFast()) {
       BuilderTy::FastMathFlagGuard Guard(Builder);
       Builder.setFastMathFlags(II->getFastMathFlags());
       Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index cb4788576c5..2974449830d 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -426,8 +426,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
 
     // Look for an appropriate type:
     // - The type of Idx if the magic fits
-    // - The smallest fitting legal type if we have a DataLayout
-    // - Default to i32
+    // - The smallest fitting legal type
     if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
       Ty = Idx->getType();
     else
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index e6b97538267..87666360c1a 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -487,7 +487,7 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
   if (!II)
     return;
-  if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+  if (II->getIntrinsicID() != Intrinsic::log2 || !II->isFast())
     return;
   Log2 = II;
 
@@ -498,7 +498,8 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
   Instruction *I = dyn_cast<Instruction>(OpLog2Of);
   if (!I)
     return;
-  if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+
+  if (I->getOpcode() != Instruction::FMul || !I->isFast())
     return;
 
   if (match(I->getOperand(0), m_SpecificFP(0.5)))
@@ -601,7 +602,7 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C,
   }
 
   if (R) {
-    R->setHasUnsafeAlgebra(true);
+    R->setFast(true);
     InsertNewInstWith(R, *InsertBefore);
   }
 
@@ -622,7 +623,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
                                   SQ.getWithInstruction(&I)))
     return replaceInstUsesWith(I, V);
 
-  bool AllowReassociate = I.hasUnsafeAlgebra();
+  bool AllowReassociate = I.isFast();
 
   // Simplify mul instructions with a constant RHS.
   if (isa<Constant>(Op1)) {
@@ -1341,7 +1342,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
       if (Instruction *R = FoldOpIntoSelect(I, SI))
         return R;
 
-  bool AllowReassociate = I.hasUnsafeAlgebra();
+  bool AllowReassociate = I.isFast();
   bool AllowReciprocal = I.hasAllowReciprocal();
 
   if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 45541c9adc0..44bbb84686a 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -310,6 +310,40 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
   }
 }
 
+// If this is a bitwise operator or add with a constant RHS we might be able
+// to pull it through a shift.
+static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
+                                         BinaryOperator *BO,
+                                         const APInt &C) {
+  bool IsValid = true;     // Valid only for And, Or Xor,
+  bool HighBitSet = false; // Transform ifhigh bit of constant set?
+
+  switch (BO->getOpcode()) {
+  default: IsValid = false; break;   // Do not perform transform!
+  case Instruction::Add:
+    IsValid = Shift.getOpcode() == Instruction::Shl;
+    break;
+  case Instruction::Or:
+  case Instruction::Xor:
+    HighBitSet = false;
+    break;
+  case Instruction::And:
+    HighBitSet = true;
+    break;
+  }
+
+  // If this is a signed shift right, and the high bit is modified
+  // by the logical operation, do not perform the transformation.
+  // The HighBitSet boolean indicates the value of the high bit of
+  // the constant which would cause it to be modified for this
+  // operation.
+  //
+  if (IsValid && Shift.getOpcode() == Instruction::AShr)
+    IsValid = C.isNegative() == HighBitSet;
+
+  return IsValid;
+}
+
 Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
                                                BinaryOperator &I) {
   bool isLeftShift = I.getOpcode() == Instruction::Shl;
@@ -472,33 +506,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
       // shift is the only use, we can pull it out of the shift.
       const APInt *Op0C;
       if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
-        bool isValid = true;     // Valid only for And, Or, Xor
-        bool highBitSet = false; // Transform if high bit of constant set?
-
-        switch (Op0BO->getOpcode()) {
-        default: isValid = false; break;   // Do not perform transform!
-        case Instruction::Add:
-          isValid = isLeftShift;
-          break;
-        case Instruction::Or:
-        case Instruction::Xor:
-          highBitSet = false;
-          break;
-        case Instruction::And:
-          highBitSet = true;
-          break;
-        }
-
-        // If this is a signed shift right, and the high bit is modified
-        // by the logical operation, do not perform the transformation.
-        // The highBitSet boolean indicates the value of the high bit of
-        // the constant which would cause it to be modified for this
-        // operation.
-        //
-        if (isValid && I.getOpcode() == Instruction::AShr)
-          isValid = Op0C->isNegative() == highBitSet;
-
-        if (isValid) {
+        if (canShiftBinOpWithConstantRHS(I, Op0BO, *Op0C)) {
           Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
                                      cast<Constant>(Op0BO->getOperand(1)), Op1);
 
@@ -525,6 +533,53 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
         return BinaryOperator::CreateSub(NewRHS, NewShift);
       }
     }
+
+    // If we have a select that conditionally executes some binary operator,
+    // see if we can pull it the select and operator through the shift.
+    //
+    // For example, turning:
+    //   shl (select C, (add X, C1), X), C2
+    // Into:
+    //   Y = shl X, C2
+    //   select C, (add Y, C1 << C2), Y
+    Value *Cond;
+    BinaryOperator *TBO;
+    Value *FalseVal;
+    if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
+                            m_Value(FalseVal)))) {
+      const APInt *C;
+      if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
+          match(TBO->getOperand(1), m_APInt(C)) &&
+          canShiftBinOpWithConstantRHS(I, TBO, *C)) {
+        Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+                                       cast<Constant>(TBO->getOperand(1)), Op1);
+
+        Value *NewShift =
+          Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
+        Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
+                                           NewRHS);
+        return SelectInst::Create(Cond, NewOp, NewShift);
+      }
+    }
+
+    BinaryOperator *FBO;
+    Value *TrueVal;
+    if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
+                            m_OneUse(m_BinOp(FBO))))) {
+      const APInt *C;
+      if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
+          match(FBO->getOperand(1), m_APInt(C)) && 
+          canShiftBinOpWithConstantRHS(I, FBO, *C)) {
+        Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+                                       cast<Constant>(FBO->getOperand(1)), Op1);
+
+        Value *NewShift =
+          Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
+        Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
+                                           NewRHS);
+        return SelectInst::Create(Cond, NewShift, NewOp);
+      }
+    }
   }
 
   return nullptr;
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 11a43e803a9..c92d48396c8 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -844,8 +844,9 @@ public:
   PGOUseFunc(Function &Func, Module *Modu,
              std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
              BranchProbabilityInfo *BPI = nullptr,
-             BlockFrequencyInfo *BFI = nullptr)
-      : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
+             BlockFrequencyInfo *BFIin = nullptr)
+      : F(Func), M(Modu), BFI(BFIin),
+        FuncInfo(Func, ComdatMembers, false, BPI, BFIin),
         FreqAttr(FFA_Normal) {}
 
   // Read counts for the instrumented BB from profile.
@@ -863,6 +864,9 @@ public:
   // Annotate the value profile call sites for one value kind.
   void annotateValueSites(uint32_t Kind);
 
+  // Annotate the irreducible loop header weights.
+  void annotateIrrLoopHeaderWeights();
+
   // The hotness of the function from the profile count.
   enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
 
@@ -894,6 +898,7 @@ public:
 private:
   Function &F;
   Module *M;
+  BlockFrequencyInfo *BFI;
 
   // This member stores the shared information with class PGOGenFunc.
   FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
@@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() {
   }
 }
 
+void PGOUseFunc::annotateIrrLoopHeaderWeights() {
+  DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+  // Find irr loop headers
+  for (auto &BB : F) {
+    if (BFI->isIrrLoopHeader(&BB)) {
+      TerminatorInst *TI = BB.getTerminator();
+      const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+      setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
+    }
+  }
+}
+
 void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   Module *M = F.getParent();
   IRBuilder<> Builder(&SI);
@@ -1441,6 +1458,7 @@ static bool annotateAllFunctions(
     Func.populateCounters();
     Func.setBranchWeights();
     Func.annotateValueSites();
+    Func.annotateIrrLoopHeaderWeights();
     PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
     if (FreqAttr == PGOUseFunc::FFA_Cold)
       ColdFunctions.push_back(&F);
@@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
 
 namespace llvm {
 
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+  MDBuilder MDB(M->getContext());
+  TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+                  MDB.createIrrLoopHeaderWeight(Count));
+}
+
 template <> struct GraphTraits<PGOUseFunc *> {
   using NodeRef = const BasicBlock *;
   using ChildIteratorType = succ_const_iterator;
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index f04d0f05ffc..1e683db5020 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -118,7 +119,8 @@ class AggressiveDeadCodeElimination {
   PostDominatorTree &PDT;
 
   /// Mapping of blocks to associated information, an element in BlockInfoVec.
-  DenseMap<BasicBlock *, BlockInfoType> BlockInfo;
+  /// Use MapVector to get deterministic iteration order.
+  MapVector<BasicBlock *, BlockInfoType> BlockInfo;
   bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
 
   /// Mapping of instructions to associated information.
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d79ae851005..6a27fbca8b7 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(LLVMScalarOpts
   ADCE.cpp
   AlignmentFromAssumptions.cpp
   BDCE.cpp
+  CallSiteSplitting.cpp
   ConstantHoisting.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
new file mode 100644
index 00000000000..b70ed8d7d4c
--- /dev/null
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -0,0 +1,492 @@
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if its argument is predicated in the control flow
+// so that we can expose better context to the later passes (e.g, inliner, jump
+// threading, or IPA-CP based function cloning, etc.).
+// As of now we support two cases :
+//
+// 1) If a call site is dominated by an OR condition and if any of its arguments
+// are predicated on this OR condition, try to split the condition with more
+// constrained arguments. For example, in the code below, we try to split the
+// call site since we can predicate the argument(ptr) based on the OR condition.
+//
+// Split from :
+//   if (!ptr || c)
+//     callee(ptr);
+// to :
+//   if (!ptr)
+//     callee(null)         // set the known constant value
+//   else if (c)
+//     callee(nonnull ptr)  // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail, label %TBB
+//   TBB:
+//    br label Tail%
+//   Tail:
+//    %p = phi i32 [ 0, %Header], [ 1, %TBB]
+//    call void @bar(i32 %p)
+// to
+//   Header:
+//    %c = icmp eq i32 %i1, %i2
+//    br i1 %c, label %Tail-split0, label %TBB
+//   TBB:
+//    br label %Tail-split1
+//   Tail-split0:
+//    call void @bar(i32 0)
+//    br label %Tail
+//   Tail-split1:
+//    call void @bar(i32 1)
+//    br label %Tail
+//   Tail:
+//    %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-site split");
+
+static void addNonNullAttribute(Instruction *CallI, Instruction *&NewCallI,
+                                Value *Op) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.addParamAttr(ArgNo, Attribute::NonNull);
+    ++ArgNo;
+  }
+}
+
+static void setConstantInArgument(Instruction *CallI, Instruction *&NewCallI,
+                                  Value *Op, Constant *ConstValue) {
+  if (!NewCallI)
+    NewCallI = CallI->clone();
+  CallSite CS(NewCallI);
+  unsigned ArgNo = 0;
+  for (auto &I : CS.args()) {
+    if (&*I == Op)
+      CS.setArgument(ArgNo, ConstValue);
+    ++ArgNo;
+  }
+}
+
+static bool createCallSitesOnOrPredicatedArgument(
+    CallSite CS, Instruction *&NewCSTakenFromHeader,
+    Instruction *&NewCSTakenFromNextCond,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *HeaderBB) {
+  assert(BranchInsts.size() <= 2 &&
+         "Unexpected number of blocks in the OR predicated condition");
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *CallSiteBB = Instr->getParent();
+  TerminatorInst *HeaderTI = HeaderBB->getTerminator();
+  bool IsCSInTakenPath = CallSiteBB == HeaderTI->getSuccessor(0);
+
+  for (unsigned I = 0, E = BranchInsts.size(); I != E; ++I) {
+    BranchInst *PBI = BranchInsts[I];
+    assert(isa<ICmpInst>(PBI->getCondition()) &&
+           "Unexpected condition in a conditional branch.");
+    ICmpInst *Cmp = cast<ICmpInst>(PBI->getCondition());
+    Value *Arg = Cmp->getOperand(0);
+    assert(isa<Constant>(Cmp->getOperand(1)) &&
+           "Expected op1 to be a constant.");
+    Constant *ConstVal = cast<Constant>(Cmp->getOperand(1));
+    CmpInst::Predicate Pred = Cmp->getPredicate();
+
+    if (PBI->getParent() == HeaderBB) {
+      Instruction *&CallTakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromHeader : NewCSTakenFromNextCond;
+      Instruction *&CallUntakenFromHeader =
+          IsCSInTakenPath ? NewCSTakenFromNextCond : NewCSTakenFromHeader;
+
+      assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+             "Unexpected predicate in an OR condition");
+
+      // Set the constant value for agruments in the call predicated based on
+      // the OR condition.
+      Instruction *&CallToSetConst = Pred == ICmpInst::ICMP_EQ
+                                         ? CallTakenFromHeader
+                                         : CallUntakenFromHeader;
+      setConstantInArgument(Instr, CallToSetConst, Arg, ConstVal);
+
+      // Add the NonNull attribute if compared with the null pointer.
+      if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+        Instruction *&CallToSetAttr = Pred == ICmpInst::ICMP_EQ
+                                          ? CallUntakenFromHeader
+                                          : CallTakenFromHeader;
+        addNonNullAttribute(Instr, CallToSetAttr, Arg);
+      }
+      continue;
+    }
+
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Set the constant value for the call taken from the second block in
+        // the OR condition.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      }
+    } else {
+      if (PBI->getSuccessor(0) == Instr->getParent()) {
+        // Add the NonNull attribute if compared with the null pointer for the
+        // call taken from the second block in the OR condition.
+        if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue())
+          addNonNullAttribute(Instr, NewCSTakenFromNextCond, Arg);
+      } else if (Pred == ICmpInst::ICMP_NE) {
+        // Set the constant value for the call in the untaken path from the
+        // header block.
+        setConstantInArgument(Instr, NewCSTakenFromNextCond, Arg, ConstVal);
+      } else
+        llvm_unreachable("Unexpected condition");
+    }
+  }
+  return NewCSTakenFromHeader || NewCSTakenFromNextCond;
+}
+
+static bool canSplitCallSite(CallSite CS) {
+  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+  // without too much effort.
+  Instruction *Instr = CS.getInstruction();
+  if (!isa<CallInst>(Instr))
+    return false;
+
+  // Allow splitting a call-site only when there is no instruction before the
+  // call-site in the basic block. Based on this constraint, we only clone the
+  // call instruction, and we do not move a call-site across any other
+  // instruction.
+  BasicBlock *CallSiteBB = Instr->getParent();
+  if (Instr != CallSiteBB->getFirstNonPHI())
+    return false;
+
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  unsigned NumPreds = std::distance(PII, PIE);
+
+  // Allow only one extra call-site. No more than two from one call-site.
+  if (NumPreds != 2)
+    return false;
+
+  // Cannot split an edge from an IndirectBrInst.
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  if (isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+      isa<IndirectBrInst>(Preds[1]->getTerminator()))
+    return false;
+
+  return CallSiteBB->canSplitPredecessors();
+}
+
+/// Return true if the CS is split into its new predecessors which are directly
+/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2.
+/// Note that PredBB1 and PredBB2 are decided in findPredicatedArgument(),
+/// especially for the OR predicated case where PredBB1 will point the header,
+/// and PredBB2 will point the the second compare block. CallInst1 and CallInst2
+/// will be the new call-sites placed in the new predecessors split for PredBB1
+/// and PredBB2, repectively. Therefore, CallInst1 will be the call-site placed
+/// between Header and Tail, and CallInst2 will be the call-site between TBB and
+/// Tail. For example, in the IR below with an OR condition, the call-site can
+/// be split
+///
+/// from :
+///
+///   Header:
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail, %TBB
+///   TBB:
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail, %End
+///   Tail:
+///     %ca = call i1  @callee (i32* %a, i32* %b)
+///
+///  to :
+///
+///   Header:                          // PredBB1 is Header
+///     %c = icmp eq i32* %a, null
+///     br i1 %c %Tail-split1, %TBB
+///   TBB:                             // PredBB2 is TBB
+///     %c2 = icmp eq i32* %b, null
+///     br i1 %c %Tail-split2, %End
+///   Tail-split1:
+///     %ca1 = call @callee (i32* null, i32* %b)         // CallInst1
+///    br %Tail
+///   Tail-split2:
+///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+///    br %Tail
+///   Tail:
+///    %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that for an OR predicated case, CallInst1 and CallInst2 should be
+/// created with more constrained arguments in
+/// createCallSitesOnOrPredicatedArgument().
+static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
+                          Instruction *CallInst1, Instruction *CallInst2) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *TailBB = Instr->getParent();
+  assert(Instr == (TailBB->getFirstNonPHI()) && "Unexpected call-site");
+
+  BasicBlock *SplitBlock1 =
+      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
+  BasicBlock *SplitBlock2 =
+      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");
+
+  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");
+
+  if (!CallInst1)
+    CallInst1 = Instr->clone();
+  if (!CallInst2)
+    CallInst2 = Instr->clone();
+
+  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
+  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());
+
+  CallSite CS1(CallInst1);
+  CallSite CS2(CallInst2);
+
+  // Handle PHIs used as arguments in the call-site.
+  for (auto &PI : *TailBB) {
+    PHINode *PN = dyn_cast<PHINode>(&PI);
+    if (!PN)
+      break;
+    unsigned ArgNo = 0;
+    for (auto &CI : CS.args()) {
+      if (&*CI == PN) {
+        CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1));
+        CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2));
+      }
+      ++ArgNo;
+    }
+  }
+
+  // Replace users of the original call with a PHI mering call-sites split.
+  if (Instr->getNumUses()) {
+    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", Instr);
+    PN->addIncoming(CallInst1, SplitBlock1);
+    PN->addIncoming(CallInst2, SplitBlock2);
+    Instr->replaceAllUsesWith(PN);
+  }
+  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
+               << "\n");
+  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
+               << "\n");
+  Instr->eraseFromParent();
+  NumCallSiteSplit++;
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+  Value *Op0 = Cmp->getOperand(0);
+  unsigned ArgNo = 0;
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
+       ++I, ++ArgNo) {
+    // Don't consider constant or arguments that are already known non-null.
+    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+      continue;
+
+    if (*I == Op0)
+      return true;
+  }
+  return false;
+}
+
+static void findOrCondRelevantToCallArgument(
+    CallSite CS, BasicBlock *PredBB, BasicBlock *OtherPredBB,
+    SmallVectorImpl<BranchInst *> &BranchInsts, BasicBlock *&HeaderBB) {
+  auto *PBI = dyn_cast<BranchInst>(PredBB->getTerminator());
+  if (!PBI || !PBI->isConditional())
+    return;
+
+  if (PBI->getSuccessor(0) == OtherPredBB ||
+      PBI->getSuccessor(1) == OtherPredBB)
+    if (PredBB == OtherPredBB->getSinglePredecessor()) {
+      assert(!HeaderBB && "Expect to find only a single header block");
+      HeaderBB = PredBB;
+    }
+
+  CmpInst::Predicate Pred;
+  Value *Cond = PBI->getCondition();
+  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+    return;
+  ICmpInst *Cmp = cast<ICmpInst>(Cond);
+  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+    if (isCondRelevantToAnyCallArgument(Cmp, CS))
+      BranchInsts.push_back(PBI);
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) {
+  Instruction *Instr = CS.getInstruction();
+  BasicBlock *Parent = Instr->getParent();
+  if (Instr != Parent->getFirstNonPHI())
+    return false;
+
+  for (auto &BI : *Parent) {
+    if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
+      for (auto &I : CS.args())
+        if (&*I == PN) {
+          assert(PN->getNumIncomingValues() == 2 &&
+                 "Unexpected number of incoming values");
+          if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
+            return false;
+          if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
+            continue;
+          if (isa<Constant>(PN->getIncomingValue(0)) &&
+              isa<Constant>(PN->getIncomingValue(1)))
+            return true;
+        }
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if an agument in CS is predicated on an 'or' condition.
+// Create new call-site with arguments constrained based on the OR condition.
+static bool findPredicatedOnOrCondition(CallSite CS, BasicBlock *PredBB1,
+                                        BasicBlock *PredBB2,
+                                        Instruction *&NewCallTakenFromHeader,
+                                        Instruction *&NewCallTakenFromNextCond,
+                                        BasicBlock *&HeaderBB) {
+  SmallVector<BranchInst *, 4> BranchInsts;
+  findOrCondRelevantToCallArgument(CS, PredBB1, PredBB2, BranchInsts, HeaderBB);
+  findOrCondRelevantToCallArgument(CS, PredBB2, PredBB1, BranchInsts, HeaderBB);
+  if (BranchInsts.empty() || !HeaderBB)
+    return false;
+
+  // If an OR condition is detected, try to create call sites with constrained
+  // arguments (e.g., NonNull attribute or constant value).
+  return createCallSitesOnOrPredicatedArgument(CS, NewCallTakenFromHeader,
+                                               NewCallTakenFromNextCond,
+                                               BranchInsts, HeaderBB);
+}
+
+static bool findPredicatedArgument(CallSite CS, Instruction *&CallInst1,
+                                   Instruction *&CallInst2,
+                                   BasicBlock *&PredBB1, BasicBlock *&PredBB2) {
+  BasicBlock *CallSiteBB = CS.getInstruction()->getParent();
+  pred_iterator PII = pred_begin(CallSiteBB);
+  pred_iterator PIE = pred_end(CallSiteBB);
+  assert(std::distance(PII, PIE) == 2 && "Expect only two predecessors.");
+  (void)PIE;
+  BasicBlock *Preds[2] = {*PII++, *PII};
+  BasicBlock *&HeaderBB = PredBB1;
+  if (!findPredicatedOnOrCondition(CS, Preds[0], Preds[1], CallInst1, CallInst2,
+                                   HeaderBB) &&
+      !isPredicatedOnPHI(CS))
+    return false;
+
+  if (!PredBB1)
+    PredBB1 = Preds[0];
+
+  PredBB2 = PredBB1 == Preds[0] ? Preds[1] : Preds[0];
+  return true;
+}
+
+static bool tryToSplitCallSite(CallSite CS) {
+  if (!CS.arg_size())
+    return false;
+
+  BasicBlock *PredBB1 = nullptr;
+  BasicBlock *PredBB2 = nullptr;
+  Instruction *CallInst1 = nullptr;
+  Instruction *CallInst2 = nullptr;
+  if (!canSplitCallSite(CS) ||
+      !findPredicatedArgument(CS, CallInst1, CallInst2, PredBB1, PredBB2)) {
+    assert(!CallInst1 && !CallInst2 && "Unexpected new call-sites cloned.");
+    return false;
+  }
+  splitCallSite(CS, PredBB1, PredBB2, CallInst1, CallInst2);
+  return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) {
+  bool Changed = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+    BasicBlock &BB = *BI++;
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II++;
+      CallSite CS(cast<Value>(I));
+      if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+        continue;
+
+      Function *Callee = CS.getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        continue;
+      Changed |= tryToSplitCallSite(CS);
+    }
+  }
+  return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+  static char ID;
+  CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+    initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    return doCallSiteSplitting(F, TLI);
+  }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+                      "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+                    "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+  return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+  if (!doCallSiteSplitting(F, TLI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 9ce42a06825..abb50f27f1c 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -48,6 +48,7 @@
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
@@ -1624,6 +1625,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
     if (DU.NarrowDef->use_empty())
       DeadInsts.emplace_back(DU.NarrowDef);
   }
+
+  // Attach any debug information to the new PHI. Since OrigPhi and WidePHI
+  // evaluate the same recurrence, we can just copy the debug info over.
+  SmallVector<DbgValueInst *, 1> DbgValues;
+  llvm::findDbgValues(DbgValues, OrigPhi);
+  auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(),
+                                     ValueAsMetadata::get(WidePhi));
+  for (auto &DbgValue : DbgValues)
+    DbgValue->setOperand(0, MDPhi);
   return WidePhi;
 }
 
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ade4fbbcb6f..e6cab3f34cf 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -192,11 +192,12 @@ JumpThreadingPass::JumpThreadingPass(int T) {
 //     P(cond == true ) = P(A) + P(cond == true | B) * P(B)
 //
 //  which gives us:
-//     P(A) <= P(c == true), i.e.
+//     P(A) is less than P(cond == true), i.e.
 //     P(t == true) <= P(cond == true)
 //
-//  In other words, if we know P(cond == true), we know that P(t == true)
-//  can not be greater than 1%.
+//  In other words, if we know P(cond == true) is unlikely, we know 
+//  that P(t == true) is also unlikely.
+//
 static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
   BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
   if (!CondBr)
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6ca8d602302..c60ec9f50f7 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -62,6 +62,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -93,9 +94,8 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                   const LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -394,8 +394,12 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       //
       if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
+          ++II;
+          CurAST->deleteValue(&I);
+          I.eraseFromParent();
+          Changed = true;
+        }
       }
     }
   }
@@ -717,26 +721,6 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
         if (!BlockColors.empty() &&
             BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
           return false;
-
-      // A PHI node where all of the incoming values are this instruction are
-      // special -- they can just be RAUW'ed with the instruction and thus
-      // don't require a use in the predecessor. This is a particular important
-      // special case because it is the pattern found in LCSSA form.
-      if (isTriviallyReplacablePHI(*PN, I)) {
-        if (CurLoop->contains(PN))
-          return false;
-        else
-          continue;
-      }
-
-      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
-      // values. Check for such a use being inside the loop.
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-        if (PN->getIncomingValue(i) == &I)
-          if (CurLoop->contains(PN->getIncomingBlock(i)))
-            return false;
-
-      continue;
     }
 
     if (CurLoop->contains(UI))
@@ -806,14 +790,96 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }
 
+static Instruction *sinkThroughTriviallyReplacablePHI(
+    PHINode *TPN, Instruction *I, LoopInfo *LI,
+    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+    const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replacalbe PHI");
+  BasicBlock *ExitBlock = TPN->getParent();
+  Instruction *New;
+  auto It = SunkCopies.find(ExitBlock);
+  if (It != SunkCopies.end())
+    New = It->second;
+  else
+    New = SunkCopies[ExitBlock] =
+        CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+  return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+  if (!BB->canSplitPredecessors())
+    return false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *BBPred = *PI;
+    if (isa<IndirectBrInst>(BBPred->getTerminator()))
+      return false;
+  }
+  return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+                                        LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+  BasicBlock *ExitBB = PN->getParent();
+  assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit to make instructions in the loop are
+  // exposed to exit blocks through trivially replacable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block should
+  // be contained within the loop. For example, this will convert the loop below
+  // from
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE, %LB1
+  // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+  //
+  // to
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE.split, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE.split2, %LB1
+  // LE.split:
+  //   %p1 = phi [%v1, %LB1]  <-- trivially replacable
+  //   br %LE
+  // LE.split2:
+  //   %p2 = phi [%v2, %LB2]  <-- trivially replacable
+  //   br %LE
+  // LE:
+  //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+  //
+  SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+  while (!PredBBs.empty()) {
+    BasicBlock *PredBB = *PredBBs.begin();
+    assert(CurLoop->contains(PredBB) &&
+           "Expect all predecessors are in the loop");
+    if (PN->getBasicBlockIndex(PredBB) >= 0)
+      SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+    PredBBs.remove(PredBB);
+  }
+}
+
 /// When an instruction is found to only be used outside of the loop, this
 /// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
 ///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit([&]() {
@@ -828,57 +894,75 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
   ++NumSunk;
   Changed = true;
 
-#ifndef NDEBUG
-  SmallVector<BasicBlock *, 32> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
-                                             ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users via
+  // unrechable blocks with undef and make all user PHIs trivially replcable.
+  SmallPtrSet<Instruction *, 8> VisitedUsers;
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+    auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;
 
-  // Clones of this instruction. Don't create more than one per exit block!
-  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+    if (VisitedUsers.count(User))
+      continue;
 
-  // If this instruction is only used outside of the loop, then all users are
-  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
-  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    auto *User = cast<Instruction>(*UI);
     if (!DT->isReachableFromEntry(User->getParent())) {
       User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
       continue;
     }
+
     // The user must be a PHI node.
     PHINode *PN = cast<PHINode>(User);
 
     // Surprisingly, instructions can be used outside of loops without any
     // exits.  This can only happen in PHI nodes if the incoming block is
     // unreachable.
-    Use &U = UI.getUse();
     BasicBlock *BB = PN->getIncomingBlock(U);
     if (!DT->isReachableFromEntry(BB)) {
       U = UndefValue::get(I.getType());
       continue;
     }
 
-    BasicBlock *ExitBlock = PN->getParent();
-    assert(ExitBlockSet.count(ExitBlock) &&
-           "The LCSSA PHI is not in an exit block!");
+    VisitedUsers.insert(PN);
+    if (isTriviallyReplacablePHI(*PN, I))
+      continue;
 
-    Instruction *New;
-    auto It = SunkCopies.find(ExitBlock);
-    if (It != SunkCopies.end())
-      New = It->second;
-    else
-      New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+    if (!canSplitPredecessors(PN))
+      return false;
+
+    // Split predecessors of the PHI so that we can make users trivially
+    // replacable.
+    splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
 
+    // Should rebuild the iterators, as they may be invalidated by
+    // splitPredecessorsOfLoopExit().
+    UI = I.user_begin();
+    UE = I.user_end();
+  }
+
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+
+  // Clones of this instruction. Don't create more than one per exit block!
+  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+  // If this instruction is only used outside of the loop, then all users are
+  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+  // the instruction.
+  while (!I.use_empty()) {
+    Value::user_iterator UI = I.user_begin();
+    PHINode *PN = cast<PHINode>(*UI);
+    assert(ExitBlockSet.count(PN->getParent()) &&
+           "The LCSSA PHI is not in an exit block!");
+    // The PHI must be trivially replacable.
+    Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies,
+                                                         SafetyInfo, CurLoop);
     PN->replaceAllUsesWith(New);
     PN->eraseFromParent();
   }
-
-  CurAST->deleteValue(&I);
-  I.eraseFromParent();
   return Changed;
 }
 
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 413fb75d172..eb5f3cc47ce 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1326,9 +1326,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
   // step 2: detect instructions corresponding to "x.next = x >> 1"
   if (!DefX || DefX->getOpcode() != Instruction::AShr)
     return false;
-  if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)))
-    if (!Shft || !Shft->isOne())
-      return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
   VarX = DefX->getOperand(0);
 
   // step 3: Check the recurrence of variable X
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 9a623be234f..52dea3254e7 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -174,6 +174,9 @@
 
 using namespace llvm;
 
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+                                        cl::Hidden, cl::init(true));
+
 namespace {
 class LoopPredication {
   /// Represents an induction variable check:
@@ -186,6 +189,10 @@ class LoopPredication {
              const SCEV *Limit)
         : Pred(Pred), IV(IV), Limit(Limit) {}
     LoopICmp() {}
+    void dump() {
+      dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+             << ", Limit = " << *Limit << "\n";
+    }
   };
 
   ScalarEvolution *SE;
@@ -195,6 +202,7 @@ class LoopPredication {
   BasicBlock *Preheader;
   LoopICmp LatchCheck;
 
+  bool isSupportedStep(const SCEV* Step);
   Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) {
     return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0),
                          ICI->getOperand(1));
@@ -204,14 +212,36 @@ class LoopPredication {
 
   Optional<LoopICmp> parseLoopLatchICmp();
 
+  bool CanExpand(const SCEV* S);
   Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
                      ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
                      Instruction *InsertAt);
 
   Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
                                         IRBuilder<> &Builder);
+  Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+                                                        LoopICmp RangeCheck,
+                                                        SCEVExpander &Expander,
+                                                        IRBuilder<> &Builder);
+
   bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
 
+  // When the IV type is wider than the range operand type, we can still do loop
+  // predication, by generating SCEVs for the range and latch that are of the
+  // same type. We achieve this by generating a SCEV truncate expression for the
+  // latch IV. This is done iff truncation of the IV is a safe operation,
+  // without loss of information.
+  // Another way to achieve this is by generating a wider type SCEV for the
+  // range check operand, however, this needs a more involved check that
+  // operands do not overflow. This can lead to loss of information when the
+  // range operand is of the form: add i32 %offset, %iv. We need to prove that
+  // sext(x + y) is same as sext(x) + sext(y).
+  // This function returns true if we can safely represent the IV type in
+  // the RangeCheckType without loss of information.
+  bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+  // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+  // so.
+  Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
 public:
   LoopPredication(ScalarEvolution *SE) : SE(SE){};
   bool runOnLoop(Loop *L);
@@ -301,53 +331,54 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
   return Builder.CreateICmp(Pred, LHSV, RHSV);
 }
 
-/// If ICI can be widened to a loop invariant condition emits the loop
-/// invariant condition in the loop preheader and return it, otherwise
-/// returns None.
-Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
-                                                       SCEVExpander &Expander,
-                                                       IRBuilder<> &Builder) {
-  DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
-  DEBUG(ICI->dump());
+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
 
-  // parseLoopStructure guarantees that the latch condition is:
-  //   ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
-  // We are looking for the range checks of the form:
-  //   i u< guardLimit
-  auto RangeCheck = parseLoopICmp(ICI);
-  if (!RangeCheck) {
-    DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+  auto *LatchType = LatchCheck.IV->getType();
+  if (RangeCheckType == LatchType)
+    return LatchCheck;
+  // For now, bail out if latch type is narrower than range type.
+  if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
     return None;
-  }
-  if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
-    DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
-                 << ")!\n");
+  if (!isSafeToTruncateWideIVType(RangeCheckType))
     return None;
-  }
-  auto *RangeCheckIV = RangeCheck->IV;
-  auto *Ty = RangeCheckIV->getType();
-  if (Ty != LatchCheck.IV->getType()) {
-    DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n");
+  // We can now safely identify the truncated version of the IV and limit for
+  // RangeCheckType.
+  LoopICmp NewLatchCheck;
+  NewLatchCheck.Pred = LatchCheck.Pred;
+  NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+      SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+  if (!NewLatchCheck.IV)
     return None;
-  }
-  if (!RangeCheckIV->isAffine()) {
-    DEBUG(dbgs() << "Range check IV is not affine!\n");
-    return None;
-  }
-  auto *Step = RangeCheckIV->getStepRecurrence(*SE);
-  if (Step != LatchCheck.IV->getStepRecurrence(*SE)) {
-    DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
-    return None;
-  }
-  assert(Step->isOne() && "must be one");
+  NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+  DEBUG(dbgs() << "IV of type: " << *LatchType
+               << "can be represented as range check type:" << *RangeCheckType
+               << "\n");
+  DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+  DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+  return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+  return Step->isOne();
+}
 
-  // Generate the widened condition:
+bool LoopPredication::CanExpand(const SCEV* S) {
+  return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+    LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck,
+    SCEVExpander &Expander, IRBuilder<> &Builder) {
+  auto *Ty = RangeCheck.IV->getType();
+  // Generate the widened condition for the forward loop:
   //   guardStart u< guardLimit &&
   //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
   // where <pred> depends on the latch condition predicate. See the file
   // header comment for the reasoning.
-  const SCEV *GuardStart = RangeCheckIV->getStart();
-  const SCEV *GuardLimit = RangeCheck->Limit;
+  // guardLimit - guardStart + latchStart - 1
+  const SCEV *GuardStart = RangeCheck.IV->getStart();
+  const SCEV *GuardLimit = RangeCheck.Limit;
   const SCEV *LatchStart = LatchCheck.IV->getStart();
   const SCEV *LatchLimit = LatchCheck.Limit;
 
@@ -355,7 +386,11 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   const SCEV *RHS =
       SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
                      SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
-
+  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
+      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
+    DEBUG(dbgs() << "Can't expand limit check!\n");
+    return None;
+  }
   ICmpInst::Predicate LimitCheckPred;
   switch (LatchCheck.Pred) {
   case ICmpInst::ICMP_ULT:
@@ -378,22 +413,68 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
   DEBUG(dbgs() << "RHS: " << *RHS << "\n");
   DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
 
-  auto CanExpand = [this](const SCEV *S) {
-    return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
-  };
-  if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) ||
-      !CanExpand(LatchLimit) || !CanExpand(RHS)) {
-    DEBUG(dbgs() << "Can't expand limit check!\n");
-    return None;
-  }
-
   Instruction *InsertAt = Preheader->getTerminator();
   auto *LimitCheck =
       expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt);
-  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck->Pred,
+  auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred,
                                           GuardStart, GuardLimit, InsertAt);
   return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
 }
+/// If ICI can be widened to a loop invariant condition emits the loop
+/// invariant condition in the loop preheader and return it, otherwise
+/// returns None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+                                                       SCEVExpander &Expander,
+                                                       IRBuilder<> &Builder) {
+  DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+  DEBUG(ICI->dump());
+
+  // parseLoopStructure guarantees that the latch condition is:
+  //   ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+  // We are looking for the range checks of the form:
+  //   i u< guardLimit
+  auto RangeCheck = parseLoopICmp(ICI);
+  if (!RangeCheck) {
+    DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+    return None;
+  }
+  DEBUG(dbgs() << "Guard check:\n");
+  DEBUG(RangeCheck->dump());
+  if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+    DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred
+                 << ")!\n");
+    return None;
+  }
+  auto *RangeCheckIV = RangeCheck->IV;
+  if (!RangeCheckIV->isAffine()) {
+    DEBUG(dbgs() << "Range check IV is not affine!\n");
+    return None;
+  }
+  auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+  // We cannot just compare with latch IV step because the latch and range IVs
+  // may have different types.
+  if (!isSupportedStep(Step)) {
+    DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+    return None;
+  }
+  auto *Ty = RangeCheckIV->getType();
+  auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+  if (!CurrLatchCheckOpt) {
+    DEBUG(dbgs() << "Failed to generate a loop latch check "
+                    "corresponding to range type: "
+                 << *Ty << "\n");
+    return None;
+  }
+
+  LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+  // At this point the range check step and latch step should have the same
+  // value and type.
+  assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
+         "Range and latch should have same step recurrence!");
+
+  return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+                                             Expander, Builder);
+}
 
 bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
                                            SCEVExpander &Expander) {
@@ -485,15 +566,6 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
     return None;
   }
 
-  if (Result->Pred != ICmpInst::ICMP_ULT &&
-      Result->Pred != ICmpInst::ICMP_SLT &&
-      Result->Pred != ICmpInst::ICMP_ULE &&
-      Result->Pred != ICmpInst::ICMP_SLE) {
-    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
-                 << ")!\n");
-    return None;
-  }
-
   // Check affine first, so if it's not we don't try to compute the step
   // recurrence.
   if (!Result->IV->isAffine()) {
@@ -502,14 +574,55 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
   }
 
   auto *Step = Result->IV->getStepRecurrence(*SE);
-  if (!Step->isOne()) {
+  if (!isSupportedStep(Step)) {
     DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
     return None;
   }
 
+  auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+    assert(Step->isOne() && "expected Step to be one!");
+    return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+           Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+  };
+
+  if (IsUnsupportedPredicate(Step, Result->Pred)) {
+    DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+                 << ")!\n");
+    return None;
+  }
   return Result;
 }
 
+// Returns true if its safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+  if (!EnableIVTruncation)
+    return false;
+  assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+             DL->getTypeSizeInBits(RangeCheckType) &&
+         "Expected latch check IV type to be larger than range check operand "
+         "type!");
+  // The start and end values of the IV should be known. This is to guarantee
+  // that truncating the wide type will not lose information.
+  auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+  auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+  if (!Limit || !Start)
+    return false;
+  // This check makes sure that the IV does not change sign during loop
+  // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+  // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+  // IV wraps around, and the truncation of the IV would lose the range of
+  // iterations between 2^32 and 2^64.
+  bool Increasing;
+  if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+    return false;
+  // The active bits should be less than the bits in the RangeCheckType. This
+  // guarantees that truncating the latch check to RangeCheckType is a safe
+  // operation.
+  auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+  return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+         Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
 bool LoopPredication::runOnLoop(Loop *Loop) {
   L = Loop;
 
@@ -535,6 +648,9 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
     return false;
   LatchCheck = *LatchCheckOpt;
 
+  DEBUG(dbgs() << "Latch check:\n");
+  DEBUG(LatchCheck.dump());
+
   // Collect all the guards into a vector and process later, so as not
   // to invalidate the instruction iterator.
   SmallVector<IntrinsicInst *, 4> Guards;
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index bbb179d3790..7f03f2379e7 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1037,7 +1037,7 @@ struct LSRFixup {
   Value *OperandValToReplace = nullptr;
 
   /// If this user is to use the post-incremented value of an induction
-  /// variable, this variable is non-null and holds the loop associated with the
+  /// variable, this set is non-empty and holds the loops associated with the
   /// induction variable.
   PostIncLoopSet PostIncLoops;
 
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index a44ca333fee..1f32f9f24aa 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -145,8 +145,7 @@ XorOpnd::XorOpnd(Value *V) {
 static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
   if (V->hasOneUse() && isa<Instruction>(V) &&
       cast<Instruction>(V)->getOpcode() == Opcode &&
-      (!isa<FPMathOperator>(V) ||
-       cast<Instruction>(V)->hasUnsafeAlgebra()))
+      (!isa<FPMathOperator>(V) || cast<Instruction>(V)->isFast()))
     return cast<BinaryOperator>(V);
   return nullptr;
 }
@@ -156,8 +155,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
   if (V->hasOneUse() && isa<Instruction>(V) &&
       (cast<Instruction>(V)->getOpcode() == Opcode1 ||
        cast<Instruction>(V)->getOpcode() == Opcode2) &&
-      (!isa<FPMathOperator>(V) ||
-       cast<Instruction>(V)->hasUnsafeAlgebra()))
+      (!isa<FPMathOperator>(V) || cast<Instruction>(V)->isFast()))
     return cast<BinaryOperator>(V);
   return nullptr;
 }
@@ -565,7 +563,7 @@ static bool LinearizeExprTree(BinaryOperator *I,
       assert((!isa<Instruction>(Op) ||
               cast<Instruction>(Op)->getOpcode() != Opcode
               || (isa<FPMathOperator>(Op) &&
-                  !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+                  !cast<Instruction>(Op)->isFast())) &&
              "Should have been handled above!");
       assert(Op->hasOneUse() && "Has uses outside the expression tree!");
 
@@ -2017,8 +2015,8 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
   if (I->isCommutative())
     canonicalizeOperands(I);
 
-  // Don't optimize floating point instructions that don't have unsafe algebra.
-  if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+  // Don't optimize floating-point instructions unless they are 'fast'.
+  if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
     return;
 
   // Do not reassociate boolean (i1) expressions.  We want to preserve the
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1ca77cfec32..44acfc88579 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -125,10 +125,10 @@ struct RewriteStatepointsForGC : public ModulePass {
       Changed |= runOnFunction(F);
 
     if (Changed) {
-      // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
+      // stripNonValidData asserts that shouldRewriteStatepointsIn
       // returns true for at least one function in the module.  Since at least
       // one function changed, we know that the precondition is satisfied.
-      stripNonValidAttributesAndMetadata(M);
+      stripNonValidData(M);
     }
 
     return Changed;
@@ -146,15 +146,17 @@ struct RewriteStatepointsForGC : public ModulePass {
   /// metadata implying dereferenceability that are no longer valid/correct after
   /// RewriteStatepointsForGC has run. This is because semantically, after
   /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
-  /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+  /// heap. stripNonValidData (conservatively) restores
   /// correctness by erasing all attributes in the module that externally imply
   /// dereferenceability. Similar reasoning also applies to the noalias
   /// attributes and metadata. gc.statepoint can touch the entire heap including
   /// noalias objects.
-  void stripNonValidAttributesAndMetadata(Module &M);
+  /// Apart from attributes and metadata, we also remove instructions that imply
+  /// constant physical memory: llvm.invariant.start.
+  void stripNonValidData(Module &M);
 
-  // Helpers for stripNonValidAttributesAndMetadata
-  void stripNonValidAttributesAndMetadataFromBody(Function &F);
+  // Helpers for stripNonValidData
+  void stripNonValidDataFromBody(Function &F);
   void stripNonValidAttributesFromPrototype(Function &F);
 
   // Certain metadata on instructions are invalid after running RS4GC.
@@ -2385,14 +2387,30 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I
   I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
 }
 
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidDataFromBody(Function &F) {
   if (F.empty())
     return;
 
   LLVMContext &Ctx = F.getContext();
   MDBuilder Builder(Ctx);
 
+  // Set of invariantstart instructions that we need to remove.
+  // Use this to avoid invalidating the instruction iterator.
+  SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
+
   for (Instruction &I : instructions(F)) {
+    // invariant.start on memory location implies that the referenced memory
+    // location is constant and unchanging. This is no longer true after
+    // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+    // which frees the entire heap and the presence of invariant.start allows
+    // the optimizer to sink the load of a memory location past a statepoint,
+    // which is incorrect.
+    if (auto *II = dyn_cast<IntrinsicInst>(&I))
+      if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+        InvariantStartInstructions.push_back(II);
+        continue;
+      }
+
     if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
       assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
       bool IsImmutableTBAA =
@@ -2422,6 +2440,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio
         RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
     }
   }
+
+  // Delete the invariant.start instructions and RAUW undef.
+  for (auto *II : InvariantStartInstructions) {
+    II->replaceAllUsesWith(UndefValue::get(II->getType()));
+    II->eraseFromParent();
+  }
 }
 
 /// Returns true if this function should be rewritten by this pass.  The main
@@ -2438,7 +2462,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
     return false;
 }
 
-void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
+void RewriteStatepointsForGC::stripNonValidData(Module &M) {
 #ifndef NDEBUG
   assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
 #endif
@@ -2447,7 +2471,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
     stripNonValidAttributesFromPrototype(F);
 
   for (Function &F : M)
-    stripNonValidAttributesAndMetadataFromBody(F);
+    stripNonValidDataFromBody(F);
 }
 
 bool RewriteStatepointsForGC::runOnFunction(Function &F) {
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index b968cb8c892..6de6c8cce2c 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -4133,8 +4133,10 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
                  "new fragment is outside of original fragment");
           Start -= OrigFragment->OffsetInBits;
         }
-        FragmentExpr =
-            DIExpression::createFragmentExpression(Expr, Start, Size);
+        if (auto E = DIExpression::createFragmentExpression(Expr, Start, Size))
+          FragmentExpr = *E;
+        else
+          continue;
       }
 
       // Remove any existing intrinsics describing the same alloca.
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index c1034ace206..8a5ae1b8731 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -35,6 +35,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCELegacyPassPass(Registry);
   initializeBDCELegacyPassPass(Registry);
   initializeAlignmentFromAssumptionsPass(Registry);
+  initializeCallSiteSplittingLegacyPassPass(Registry);
   initializeConstantHoistingLegacyPassPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index fbb61ac1ae9..2e6fc4e8482 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -203,6 +203,23 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
 }
 
 void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+  // Check the summaries to see if the symbol gets resolved to a known local
+  // definition.
+  if (GV.hasName()) {
+    ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID());
+    if (VI) {
+      // Need to check all summaries are local in case of hash collisions.
+      bool IsLocal = VI.getSummaryList().size() &&
+          llvm::all_of(VI.getSummaryList(),
+                       [](const std::unique_ptr<GlobalValueSummary> &Summary) {
+                         return Summary->isDSOLocal();
+                       });
+      if (IsLocal)
+        GV.setDSOLocal(true);
+    }
+  }
+
   bool DoPromote = false;
   if (GV.hasLocalLinkage() &&
       ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) {
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 8c643c93ec4..89dbe4b8fda 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1362,16 +1362,25 @@ void llvm::salvageDebugInfo(Instruction &I) {
   SmallVector<DbgValueInst *, 1> DbgValues;
   auto &M = *I.getModule();
 
-  auto MDWrap = [&](Value *V) {
+  auto wrapMD = [&](Value *V) {
     return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
   };
 
-  if (isa<BitCastInst>(&I)) {
+  auto applyOffset = [&](DbgValueInst *DVI, uint64_t Offset) {
+    auto *DIExpr = DVI->getExpression();
+    DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
+                                   DIExpression::WithStackValue);
+    DVI->setOperand(0, wrapMD(I.getOperand(0)));
+    DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
+    DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+  };
+
+  if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) {
     findDbgValues(DbgValues, &I);
     for (auto *DVI : DbgValues) {
       // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
       // to use the cast's source.
-      DVI->setOperand(0, MDWrap(I.getOperand(0)));
+      DVI->setOperand(0, wrapMD(I.getOperand(0)));
       DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
     }
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
@@ -1383,24 +1392,26 @@ void llvm::salvageDebugInfo(Instruction &I) {
       // Rewrite a constant GEP into a DIExpression.  Since we are performing
       // arithmetic to compute the variable's *value* in the DIExpression, we
       // need to mark the expression with a DW_OP_stack_value.
-      if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
-        auto *DIExpr = DVI->getExpression();
+      if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset))
         // GEP offsets are i32 and thus always fit into an int64_t.
-        DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref,
-                                       Offset.getSExtValue(),
-                                       DIExpression::WithStackValue);
-        DVI->setOperand(0, MDWrap(I.getOperand(0)));
-        DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
-        DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
-      }
+        applyOffset(DVI, Offset.getSExtValue());
     }
+  } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
+    if (BI->getOpcode() == Instruction::Add)
+      if (auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1)))
+        if (ConstInt->getBitWidth() <= 64) {
+          APInt Offset = ConstInt->getValue();
+          findDbgValues(DbgValues, &I);
+          for (auto *DVI : DbgValues)
+            applyOffset(DVI, Offset.getSExtValue());
+        }
   } else if (isa<LoadInst>(&I)) {
     findDbgValues(DbgValues, &I);
     for (auto *DVI : DbgValues) {
       // Rewrite the load into DW_OP_deref.
       auto *DIExpr = DVI->getExpression();
       DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
-      DVI->setOperand(0, MDWrap(I.getOperand(0)));
+      DVI->setOperand(0, wrapMD(I.getOperand(0)));
       DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr));
       DEBUG(dbgs() << "SALVAGE:  " << *DVI << '\n');
     }
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 13c0bfbcb2e..0de6924e635 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -432,7 +432,7 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
                                         InstDesc &Prev, bool HasFunNoNaNAttr) {
   bool FP = I->getType()->isFloatingPointTy();
   Instruction *UAI = Prev.getUnsafeAlgebraInst();
-  if (!UAI && FP && !I->hasUnsafeAlgebra())
+  if (!UAI && FP && !I->isFast())
     UAI = I; // Found an unsafe (unvectorizable) algebra instruction.
 
   switch (I->getOpcode()) {
@@ -660,11 +660,11 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
     break;
   }
 
-  // We only match FP sequences with unsafe algebra, so we can unconditionally
+  // We only match FP sequences that are 'fast', so we can unconditionally
   // set it on any generated instructions.
   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
   FastMathFlags FMF;
-  FMF.setUnsafeAlgebra();
+  FMF.setFast();
   Builder.setFastMathFlags(FMF);
 
   Value *Cmp;
@@ -768,7 +768,7 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
 
     // Floating point operations had to be 'fast' to enable the induction.
     FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
+    Flags.setFast();
 
     Value *MulExp = B.CreateFMul(StepValue, Index);
     if (isa<Instruction>(MulExp))
@@ -1338,7 +1338,7 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
 static Value *addFastMathFlag(Value *V) {
   if (isa<FPMathOperator>(V)) {
     FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
+    Flags.setFast();
     cast<Instruction>(V)->setFastMathFlags(Flags);
   }
   return V;
@@ -1401,7 +1401,7 @@ Value *llvm::createSimpleTargetReduction(
   RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
   // TODO: Support creating ordered reductions.
   FastMathFlags FMFUnsafe;
-  FMFUnsafe.setUnsafeAlgebra();
+  FMFUnsafe.setFast();
 
   switch (Opcode) {
   case Instruction::Add:
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3c4dae92ebf..e0045e9f48a 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2901,7 +2901,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
       else
         return false;
     }
-    return N <= PHINodeFoldingThreshold;
+    // The store we want to merge is counted in N, so add 1 to make sure
+    // we're counting the instructions that would be left.
+    return N <= (PHINodeFoldingThreshold + 1);
   };
 
   if (!MergeCondStoresAggressively &&
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 33117659489..a29b83717f3 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1111,7 +1111,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
   // Example: x = 1000, y = 0.001.
   // pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
   auto *OpC = dyn_cast<CallInst>(Op1);
-  if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) {
+  if (OpC && OpC->isFast() && CI->isFast()) {
     LibFunc Func;
     Function *OpCCallee = OpC->getCalledFunction();
     if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) &&
@@ -1136,7 +1136,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
                       LibFunc_sqrtl)) {
     // If -ffast-math:
     // pow(x, -0.5) -> 1.0 / sqrt(x)
-    if (CI->hasUnsafeAlgebra()) {
+    if (CI->isFast()) {
       IRBuilder<>::FastMathFlagGuard Guard(B);
       B.setFastMathFlags(CI->getFastMathFlags());
 
@@ -1157,7 +1157,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
                       LibFunc_sqrtl)) {
 
     // In -ffast-math, pow(x, 0.5) -> sqrt(x).
-    if (CI->hasUnsafeAlgebra()) {
+    if (CI->isFast()) {
       IRBuilder<>::FastMathFlagGuard Guard(B);
       B.setFastMathFlags(CI->getFastMathFlags());
 
@@ -1196,7 +1196,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
     return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
 
   // In -ffast-math, generate repeated fmul instead of generating pow(x, n).
-  if (CI->hasUnsafeAlgebra()) {
+  if (CI->isFast()) {
     APFloat V = abs(Op2C->getValueAPF());
     // We limit to a max of 7 fmul(s). Thus max exponent is 32.
     // This transformation applies to integer exponents only.
@@ -1284,9 +1284,9 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
 
   IRBuilder<>::FastMathFlagGuard Guard(B);
   FastMathFlags FMF;
-  if (CI->hasUnsafeAlgebra()) {
-    // Unsafe algebra sets all fast-math-flags to true.
-    FMF.setUnsafeAlgebra();
+  if (CI->isFast()) {
+    // If the call is 'fast', then anything we create here will also be 'fast'.
+    FMF.setFast();
   } else {
     // At a minimum, no-nans-fp-math must be true.
     if (!CI->hasNoNaNs())
@@ -1317,13 +1317,13 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
   if (UnsafeFPShrink && hasFloatVersion(Name))
     Ret = optimizeUnaryDoubleFP(CI, B, true);
 
-  if (!CI->hasUnsafeAlgebra())
+  if (!CI->isFast())
     return Ret;
   Value *Op1 = CI->getArgOperand(0);
   auto *OpC = dyn_cast<CallInst>(Op1);
 
-  // The earlier call must also be unsafe in order to do these transforms.
-  if (!OpC || !OpC->hasUnsafeAlgebra())
+  // The earlier call must also be 'fast' in order to do these transforms.
+  if (!OpC || !OpC->isFast())
     return Ret;
 
   // log(pow(x,y)) -> y*log(x)
@@ -1333,7 +1333,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
 
   IRBuilder<>::FastMathFlagGuard Guard(B);
   FastMathFlags FMF;
-  FMF.setUnsafeAlgebra();
+  FMF.setFast();
   B.setFastMathFlags(FMF);
 
   LibFunc Func;
@@ -1365,11 +1365,11 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
                                   Callee->getIntrinsicID() == Intrinsic::sqrt))
     Ret = optimizeUnaryDoubleFP(CI, B, true);
 
-  if (!CI->hasUnsafeAlgebra())
+  if (!CI->isFast())
     return Ret;
 
   Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
-  if (!I || I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+  if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
     return Ret;
 
   // We're looking for a repeated factor in a multiplication tree,
@@ -1391,8 +1391,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
     Value *OtherMul0, *OtherMul1;
     if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
       // Pattern: sqrt((x * y) * z)
-      if (OtherMul0 == OtherMul1 &&
-          cast<Instruction>(Op0)->hasUnsafeAlgebra()) {
+      if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
         // Matched: sqrt((x * x) * z)
         RepeatOp = OtherMul0;
         OtherOp = Op1;
@@ -1437,8 +1436,8 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
   if (!OpC)
     return Ret;
 
-  // Both calls must allow unsafe optimizations in order to remove them.
-  if (!CI->hasUnsafeAlgebra() || !OpC->hasUnsafeAlgebra())
+  // Both calls must be 'fast' in order to remove them.
+  if (!CI->isFast() || !OpC->isFast())
     return Ret;
 
   // tan(atan(x)) -> x
@@ -2167,10 +2166,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
 
   // Command-line parameter overrides instruction attribute.
   // This can't be moved to optimizeFloatingPointLibCall() because it may be
-  // used by the intrinsic optimizations. 
+  // used by the intrinsic optimizations.
   if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
     UnsafeFPShrink = EnableUnsafeFPShrink;
-  else if (isa<FPMathOperator>(CI) && CI->hasUnsafeAlgebra())
+  else if (isa<FPMathOperator>(CI) && CI->isFast())
     UnsafeFPShrink = true;
 
   // First, check for intrinsics.
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp
index 07157069518..934a1bd73c2 100644
--- a/lib/Transforms/Utils/SplitModule.cpp
+++ b/lib/Transforms/Utils/SplitModule.cpp
@@ -141,15 +141,15 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
     }
 
     if (GV.hasLocalLinkage())
-      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
-  };
-
-  std::for_each(M->begin(), M->end(), recordGVSet);
-  std::for_each(M->global_begin(), M->global_end(), recordGVSet);
-  std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
-
-  // Assigned all GVs to merged clusters while balancing number of objects in
-  // each.
+      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+  };
+
+  llvm::for_each(M->functions(), recordGVSet);
+  llvm::for_each(M->globals(), recordGVSet);
+  llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assigned all GVs to merged clusters while balancing number of objects in
+  // each.
   auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
                             const std::pair<unsigned, unsigned> &b) {
     if (a.second || b.second)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca2f5a178e0..ed29ca0b573 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -385,7 +385,7 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
 static Value *addFastMathFlag(Value *V) {
   if (isa<FPMathOperator>(V)) {
     FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
+    Flags.setFast();
     cast<Instruction>(V)->setFastMathFlags(Flags);
   }
   return V;
@@ -2720,7 +2720,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
 
   // Floating point operations had to be 'fast' to enable the induction.
   FastMathFlags Flags;
-  Flags.setUnsafeAlgebra();
+  Flags.setFast();
 
   Value *MulOp = Builder.CreateFMul(Cv, Step);
   if (isa<Instruction>(MulOp))
@@ -5396,7 +5396,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // operations, shuffles, or casts, as they don't change precision or
         // semantics.
       } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
-                 !I.hasUnsafeAlgebra()) {
+                 !I.isFast()) {
         DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
         Hints->setPotentiallyUnsafe();
       }
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5dcf5528ac9..4232252af36 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4880,7 +4880,7 @@ class HorizontalReduction {
       case RK_Min:
       case RK_Max:
         return Opcode == Instruction::ICmp ||
-               cast<Instruction>(I->getOperand(0))->hasUnsafeAlgebra();
+               cast<Instruction>(I->getOperand(0))->isFast();
       case RK_UMin:
       case RK_UMax:
         assert(Opcode == Instruction::ICmp &&
@@ -5232,7 +5232,7 @@ public:
     Value *VectorizedTree = nullptr;
     IRBuilder<> Builder(ReductionRoot);
     FastMathFlags Unsafe;
-    Unsafe.setUnsafeAlgebra();
+    Unsafe.setFast();
     Builder.setFastMathFlags(Unsafe);
     unsigned i = 0;