Integrate llvm-project at 7cb4c2617391b80993e7c10f3a34c9e172f7ad41 (#8190)

* Reset third_party/llvm-project: 7cb4c2617391b80993e7c10f3a34c9e172f7ad41 (2022-01-25 20:53:45 -0500): [OMPIRBuilder] Generate aggregate argument for parallel region outlined functions

* Update mlir-hlo to match the LLVM commit

* Remove mlir-hlo.branch-pin given we track an upstream commit now

* Update TensorFlow integrations to match the LLVM commit, including local changes:
  * Disables the inconsistent-missing-override warning, with which TensorFlow sources do not comply.
  * Upgrades Identifier -> StringAttr.

* Fix header locations and build dependencies

* Enable passed tests

Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index c03480a..c2f5d1e 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
 
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
-TENSORFLOW_COMMIT = "a20bfc24dfbc34ef4de644e6bf46b41e6e57b878"
+TENSORFLOW_COMMIT = "7e6137a2bc46a10664fd58c1686719a520e024c2"
 
 git_repository(
     name = "org_tensorflow",
diff --git a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
index 929ef04..2c26542 100644
--- a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
+++ b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
@@ -10,3 +10,6 @@
 # Ignore visibility issues in TensorFlow. They are inconsistently applied
 # to the OSS codebase.
 build --nocheck_visibility
+
+# Flags specific to working around TensorFlow warnings.
+build:generic_clang --copt=-Wno-inconsistent-missing-override --host_copt=-Wno-inconsistent-missing-override
diff --git a/integrations/tensorflow/iree-dialects/BUILD b/integrations/tensorflow/iree-dialects/BUILD
index b936e2c..28fe907 100644
--- a/integrations/tensorflow/iree-dialects/BUILD
+++ b/integrations/tensorflow/iree-dialects/BUILD
@@ -279,6 +279,7 @@
         "@llvm-project//mlir:StandardOps",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TensorUtils",
         "@llvm-project//mlir:ViewLikeInterface",
     ],
 )
@@ -310,6 +311,7 @@
         "@llvm-project//mlir:StandardOps",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TensorUtils",
         "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
@@ -354,6 +356,7 @@
         ":TdFiles",
         "@llvm-project//mlir:CallInterfacesTdFiles",
         "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
+        "@llvm-project//mlir:FunctionInterfacesTdFiles",
     ],
 )
 
@@ -427,6 +430,7 @@
         ":TdFiles",
         "@llvm-project//mlir:CallInterfacesTdFiles",
         "@llvm-project//mlir:ControlFlowInterfacesTdFiles",
+        "@llvm-project//mlir:FunctionInterfacesTdFiles",
     ],
 )
 
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
index 062e0d6..f149e68 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -256,7 +256,7 @@
     Computes the inclusive/exclusive scan along a given dimension.
   }];
 
-  let arguments = (ins Variadic<AnyType>:$inputs,
+  let arguments = (ins Variadic<AnyShaped>:$inputs,
                        Variadic<AnyShaped>:$outputs,
                        I64Attr:$dimension,
                        BoolAttr:$inclusive
@@ -269,12 +269,13 @@
 
   let results = (outs Variadic<AnyRankedTensor>:$results);
   let regions = (region AnyRegion:$region);
+  let hasFolder = 1;
   let assemblyFormat = [{
     `dimension` `(` $dimension `)`
     `inclusive` `(` $inclusive `)`
     attr-dict
     `ins` `(` $inputs `:` type($inputs) `)`
-    (`outs` `(` $outputs^ `:` type($outputs) `)`)?
+    `outs` `(` $outputs `:` type($outputs) `)`
     $region (`->` type($results)^)?
   }];
 
@@ -282,8 +283,8 @@
     Value input() {
       return getInputOperand(0)->get();
     }
-    Value identity() {
-      return getInputOperand(1)->get();
+    Value accumulator() {
+      return getOutputOperand(1)->get();
     }
     Value output() {
       return getOutputOperand(0)->get();
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
index ad82868..cce9fb0 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/IR/PyDMOps.td
@@ -8,6 +8,7 @@
 #define IREE_DIALECTS_DIALECT_PYDM_IR_PYDM_OPS_TD
 
 include "iree-dialects/Dialect/PyDM/IR/PyDMDialect.td"
+include "mlir/IR/FunctionInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/CallInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
@@ -98,7 +99,7 @@
 // cell_vars attributes (with better modeling, they may not be needed at all).
 def IREEPyDM_FuncOp : IREEPyDM_Op<"func", [
     IsolatedFromAbove,
-    FunctionLike,
+    FunctionOpInterface,
     CallableOpInterface,
     Symbol,
     DeclareOpInterfaceMethods<OpAsmOpInterface, ["getDefaultDialect"]>]> {
@@ -147,20 +148,17 @@
           .cast<FunctionType>();
     }
 
+    /// Returns the argument types of this function.
+    ArrayRef<Type> getArgumentTypes() { return getType().getInputs(); }
+
+    /// Returns the result types of this function.
+    ArrayRef<Type> getResultTypes() { return getType().getResults(); }
+
     /// Returns the python return type of the function (second return type).
     Type getPyReturnType() {
       return getType().getResult(1);
     }
 
-    /// Hook for OpTrait::FunctionLike, returns the number of function
-    /// arguments. Depends on the type attribute being correct as checked by
-    /// verifyType.
-    unsigned getNumFuncArguments() { return getType().getInputs().size(); }
-
-    /// Hook for OpTrait::FunctionLike, returns the number of function results.
-    /// Depends on the type attribute being correct as checked by verifyType.
-    unsigned getNumFuncResults() { return getType().getResults().size(); }
-
     /// Hook for OpTrait::FunctionLike, called after verifying that the 'type'
     /// attribute is present. This can check for preconditions of the
     /// getNumArguments hook not failing.
diff --git a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/Utils/TypeInference.h b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/Utils/TypeInference.h
index cdc628b..aec96be 100644
--- a/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/Utils/TypeInference.h
+++ b/integrations/tensorflow/iree-dialects/include/iree-dialects/Dialect/PyDM/Utils/TypeInference.h
@@ -76,7 +76,7 @@
 
   /// Creates a new block permutation. The initialize callback must populate
   /// the mapping for all original arguments.
-  Block *createBlockPermutation(ParentBlockInfo *parentInfo,
+  Block *createBlockPermutation(Location loc, ParentBlockInfo *parentInfo,
                                 TypeRange newArgumentTypes,
                                 BlockPermuteCallback initializeCallback);
 
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
index ef6a95a..060d308 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/Input/InputDialect.cpp
@@ -29,15 +29,3 @@
 #include "iree-dialects/Dialect/Input/InputOps.cpp.inc"
       >();
 }
-
-Type IREEInputDialect::parseType(DialectAsmParser &parser) const {
-  StringRef typeTag;
-  Type genType;
-  if (succeeded(parser.parseKeyword(&typeTag)))
-    generatedTypeParser(parser, typeTag, genType);
-  return genType;
-}
-
-void IREEInputDialect::printType(Type type, DialectAsmPrinter &printer) const {
-  (void)generatedTypePrinter(type, printer);
-}
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
index 412f534..248904e 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/LinalgExtOps.cpp
@@ -772,33 +772,46 @@
 //===----------------------------------------------------------------------===//
 
 static LogicalResult verifyScanOp(ScanOp op) {
-  if (op.getNumInputs() != 2) {
-    return op.emitOpError("expected two input operands");
+  if (op.getNumInputs() != 1) {
+    return op.emitOpError("expected one input operands");
   }
-  if (op.getNumOutputs() != 1) {
-    return op.emitOpError("expected one output operand");
+  if (op.getNumOutputs() != 2) {
+    return op.emitOpError("expected two output operands");
   }
   if (!op.input().getType().isa<ShapedType>()) {
     return op.emitOpError("expected first input element type to be shaped");
   }
-  auto identityElementType = op.identity().getType();
-  if (!(identityElementType.isa<FloatType>() ||
-        identityElementType.isa<IntegerType>())) {
-    return op.emitOpError(
-        "expected second input element type to be float or integer");
-  }
+  auto accumulatorType = op.accumulator().getType().cast<ShapedType>();
   auto inputType = op.input().getType().cast<ShapedType>();
   auto outputType = op.output().getType().cast<ShapedType>();
-  if (identityElementType != inputType.getElementType()) {
+  ArrayRef<int64_t> inputShapes = inputType.getShape();
+  ArrayRef<int64_t> outputShapes = outputType.getShape();
+  if (accumulatorType.getElementType() != inputType.getElementType()) {
     return op.emitOpError(
-        "expected input/identity element types to be identical");
+        "expected input/accumulator element types to be identical");
+  }
+  ArrayRef<int64_t> accumulatorShape = accumulatorType.getShape();
+  int64_t accumulatorRank = accumulatorType.getRank();
+  if (accumulatorRank != inputType.getRank() - 1) {
+    return op.emitOpError(
+        "expected accumulator rank to be equal to input rank - 1");
+  }
+  SmallVector<int64_t> expectedAccumulatorShape;
+  for (int i = 0; i < inputType.getRank(); i++) {
+    if (i != op.dimension()) expectedAccumulatorShape.push_back(inputShapes[i]);
+  }
+  if (llvm::any_of(llvm::zip(expectedAccumulatorShape, accumulatorShape),
+                   [](std::tuple<int64_t, int64_t> s) {
+                     return std::get<0>(s) != ShapedType::kDynamicSize &&
+                            std::get<1>(s) != ShapedType::kDynamicSize &&
+                            std::get<0>(s) != std::get<1>(s);
+                   })) {
+    return op.emitOpError("incompatible input/accumulator shapes");
   }
   if (inputType.getElementType() != outputType.getElementType()) {
     return op.emitOpError(
         "expected input/output element types to be identical");
   }
-  ArrayRef<int64_t> inputShapes = inputType.getShape();
-  ArrayRef<int64_t> outputShapes = outputType.getShape();
   if (inputShapes.size() != outputShapes.size()) {
     return op.emitOpError("expected input/output to have identical ranks");
   }
@@ -862,6 +875,11 @@
   auto cond = b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                       indices[scanDim], zero);
   bool isInclusive = inclusive();
+  SmallVector<Value> accIndices;
+  for (int i = 0; i < indices.size(); i++) {
+    if (i != scanDim) accIndices.push_back(indices[i]);
+  }
+
   auto scfIf = b.create<scf::IfOp>(
       loc, TypeRange{}, cond,
       [&](OpBuilder &b, Location loc) {
@@ -869,7 +887,8 @@
           auto value = b.create<memref::LoadOp>(loc, input(), indices);
           b.create<memref::StoreOp>(loc, value, output(), indices);
         } else {
-          b.create<memref::StoreOp>(loc, identity(), output(), indices);
+          auto value = b.create<memref::LoadOp>(loc, accumulator(), accIndices);
+          b.create<memref::StoreOp>(loc, value, output(), indices);
         }
         b.create<scf::YieldOp>(loc);
       },
@@ -902,6 +921,9 @@
     b.create<memref::StoreOp>(
         loc, bvm.lookupOrDefault(srcBlock.getTerminator()->getOperand(0)),
         output(), indices);
+    b.create<memref::StoreOp>(
+        loc, bvm.lookupOrDefault(srcBlock.getTerminator()->getOperand(0)),
+        accumulator(), accIndices);
     b.create<scf::YieldOp>(loc);
   }
   return success();
@@ -922,18 +944,37 @@
   SmallVector<Value> tiledOperands;
   tiledOperands.emplace_back(
       getSlice(builder, getLoc(), input(), offsets, sizes, strides));
-  tiledOperands.emplace_back(identity());
   tiledOperands.emplace_back(
-      getSlice(builder, getLoc(), output(), offsets, sizes, strides));
+      getSlice(builder, getLoc(), outputs[0], offsets, sizes, strides));
+  SmallVector<OpFoldResult> accumOffsets, accumSizes, accumStrides;
+  if (rank > 1) {
+    for (int i = 0; i < rank; i++) {
+      if (i != dimension()) {
+        accumOffsets.push_back(offsets[i]);
+        accumSizes.push_back(sizes[i]);
+        accumStrides.push_back(strides[i]);
+      }
+    }
+    tiledOperands.emplace_back(getSlice(
+        builder, getLoc(), outputs[1], accumOffsets, accumSizes, accumStrides));
+  } else {
+    tiledOperands.emplace_back(outputs[1]);
+  }
 
   SmallVector<Type, 4> resultTypes;
   if (hasTensorSemantics()) {
+    resultTypes.push_back(tiledOperands[1].getType());
     resultTypes.push_back(tiledOperands[2].getType());
   }
 
   Operation *tiledScanOp = cast<LinalgExtOp>(getOperation())
                                .clone(builder, loc, resultTypes, tiledOperands);
   for (auto result : llvm::enumerate(tiledScanOp->getResults())) {
+    if ((result.index() == resultTypes.size() - 1) && (rank > 1)) {
+      offsets = accumOffsets;
+      sizes = accumSizes;
+      strides = accumStrides;
+    }
     auto insertSliceOp = builder.create<tensor::InsertSliceOp>(
         loc, result.value(), outputs[result.index()], offsets, sizes, strides);
     results.push_back(insertSliceOp.getResult());
@@ -941,6 +982,23 @@
   return tiledScanOp;
 }
 
+static LogicalResult foldMemRefCast(Operation *op) {
+  bool folded = false;
+  for (OpOperand &operand : op->getOpOperands()) {
+    auto castOp = operand.get().getDefiningOp<memref::CastOp>();
+    if (castOp && memref::CastOp::canFoldIntoConsumerOp(castOp)) {
+      operand.set(castOp.getOperand());
+      folded = true;
+    }
+  }
+  return success(folded);
+}
+
+LogicalResult ScanOp::fold(ArrayRef<Attribute>,
+                           SmallVectorImpl<OpFoldResult> &) {
+  return foldMemRefCast(*this);
+}
+
 //===----------------------------------------------------------------------===//
 // ReverseOp
 //===----------------------------------------------------------------------===//
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/TiledOpInterface.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/TiledOpInterface.cpp
index 617cbb4..b9b1f35 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/TiledOpInterface.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/IR/TiledOpInterface.cpp
@@ -307,6 +307,7 @@
   registry
       .addOpInterface<tensor::ExtractSliceOp, ExtractSliceTiledOpInterface>();
   registry.addOpInterface<tensor::InsertSliceOp, InsertSliceTiledOpInterface>();
-  registry.addOpInterface<linalg::PadTensorOp,
-                          ForwardToTilingInterface<linalg::PadTensorOp>>();
+  // TODO(ravishankarm): Needs custom PadTiledOpInterface or equiv.
+  // registry.addOpInterface<tensor::PadOp,
+  //                         ForwardToTilingInterface<tensor::PadOp>>();
 }
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/PadContractionToBlockSize.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/PadContractionToBlockSize.cpp
index 7a5771a..b050cc7 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/PadContractionToBlockSize.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/LinalgExt/Transforms/PadContractionToBlockSize.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -84,7 +85,7 @@
   SmallVector<OpFoldResult> zeroStaticLow(shape.size(),
                                           builder.getI64IntegerAttr(0));
   SmallVector<Value> nullLow;
-  Value padded = linalg::PadTensorOp::createPadScalarOp(
+  Value padded = tensor::createPadScalarOp(
       resultType, operand->get(), zeroConstant, zeroStaticLow, newPaddingSizes,
       false, loc, builder);
   operand->set(padded);
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
index 5db10f7..ca77eac 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMDialect.cpp
@@ -67,27 +67,6 @@
   llvm_unreachable("unhandled iree_pydm constant materialization");
 }
 
-Type IREEPyDMDialect::parseType(DialectAsmParser &parser) const {
-  StringRef typeTag;
-  if (succeeded(parser.parseKeyword(&typeTag))) {
-    Type genType;
-    auto parseResult = generatedTypeParser(parser, typeTag, genType);
-    if (parseResult.hasValue()) {
-      if (*parseResult) {
-        return Type();
-      }
-      return genType;
-    }
-  }
-
-  parser.emitError(parser.getNameLoc(), "unknown dialect type");
-  return Type();
-}
-
-void IREEPyDMDialect::printType(Type type, DialectAsmPrinter &printer) const {
-  (void)generatedTypePrinter(type, printer);
-}
-
 //------------------------------------------------------------------------------
 // Python type implementation
 //------------------------------------------------------------------------------
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
index 4ec61e8..3c87c2d 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/IR/PyDMOps.cpp
@@ -488,6 +488,7 @@
     // Print yield explicitly if the op defines values.
     printBlockTerminators = true;
   }
+  p << " ";
   p.printRegion(op.thenRegion(),
                 /*printEntryBlockArgs=*/false,
                 /*printBlockTerminators=*/printBlockTerminators);
@@ -495,7 +496,7 @@
   // Print the 'else' regions if it exists and has a block.
   auto &elseRegion = op.elseRegion();
   if (!elseRegion.empty()) {
-    p << " else";
+    p << " else ";
     p.printRegion(elseRegion,
                   /*printEntryBlockArgs=*/false,
                   /*printBlockTerminators=*/printBlockTerminators);
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
index 4f4a73f..0836591 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/LocalPropagateTypes.cpp
@@ -209,7 +209,7 @@
           LLVM_DEBUG(dbgs() << "  -- Creating new permutation for "
                             << mismatch.signature << "\n");
           permutation = propagator.createBlockPermutation(
-              parentInfo, mismatch.signature.getInputs(),
+              loc, parentInfo, mismatch.signature.getInputs(),
               [&](Block *newBlock, Block *origBlock,
                   BlockAndValueMapping &mapping) {
                 OpBuilder builder(newBlock, newBlock->begin());
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/VariablesToSSA.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/VariablesToSSA.cpp
index 587ef76..fc2823d 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/VariablesToSSA.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/Optimize/VariablesToSSA.cpp
@@ -173,7 +173,8 @@
       auto loadOp = llvm::cast<LoadVarOp>(genericLoadOp);
       loadVarTypes.emplace_back(loadOp.getLoc(), loadOp.var(),
                                 loadOp.getResult().getType());
-      Value newArg = block.addArgument(loadOp.getResult().getType());
+      Value newArg = block.addArgument(loadOp.getResult().getType(),
+                                       genericLoadOp->getLoc());
       info.blockArgVariableValueMap[loadOp.var()] = newArg;
       loadOp.getResult().replaceAllUsesWith(newArg);
       loadOp->erase();
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/ToIREE/LoweringPatterns.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/ToIREE/LoweringPatterns.cpp
index 37a6f9a..9e7a931 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/ToIREE/LoweringPatterns.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Transforms/ToIREE/LoweringPatterns.cpp
@@ -344,11 +344,11 @@
         rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
     Block *indexLtZeroBlock = rewriter.createBlock(continuationBlock);
     Block *indexCheckBlock = rewriter.createBlock(continuationBlock);
-    indexCheckBlock->addArgument(indexType);
+    indexCheckBlock->addArgument(indexType, loc);
     Block *setElementBlock = rewriter.createBlock(continuationBlock);
-    setElementBlock->addArgument(indexType);
+    setElementBlock->addArgument(indexType, loc);
     Block *failureBlock = createSlowPathBlock(rewriter);
-    continuationBlock->addArgument(statusType);
+    continuationBlock->addArgument(statusType, loc);
     rewriter.replaceOp(srcOp, continuationBlock->getArguments());
 
     // Comparison index < 0.
@@ -548,8 +548,10 @@
         rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
     Block *arityMatchBlock = rewriter.createBlock(continuationBlock);
     Block *errorBlock = createSlowPathBlock(rewriter);
-    continuationBlock->addArguments(excResultType);
-    continuationBlock->addArguments(slotTypes);
+    continuationBlock->addArgument(excResultType, loc);
+    for (auto slotType : slotTypes) {
+      continuationBlock->addArgument(slotType, loc);
+    }
     rewriter.replaceOp(srcOp, continuationBlock->getArguments());
 
     // Entry block - check arity.
@@ -930,11 +932,11 @@
     Block *continuationBlock = rewriter.splitBlock(
         rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
     Block *outerCond = rewriter.createBlock(continuationBlock);
-    outerCond->addArgument(indexType);
+    outerCond->addArgument(indexType, loc);
     Block *innerCond = rewriter.createBlock(continuationBlock);
-    innerCond->addArguments({indexType, indexType});
+    innerCond->addArguments({indexType, indexType}, {loc, loc});
     Block *innerBody = rewriter.createBlock(continuationBlock);
-    innerBody->addArguments({indexType, indexType});
+    innerBody->addArguments({indexType, indexType}, {loc, loc});
 
     // Entry block.
     {
@@ -1075,11 +1077,11 @@
         rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
     Block *indexLtZeroBlock = rewriter.createBlock(continuationBlock);
     Block *indexCheckBlock = rewriter.createBlock(continuationBlock);
-    indexCheckBlock->addArgument(indexType);
+    indexCheckBlock->addArgument(indexType, loc);
     Block *getElementBlock = rewriter.createBlock(continuationBlock);
-    getElementBlock->addArgument(indexType);
+    getElementBlock->addArgument(indexType, loc);
     Block *failureBlock = createSlowPathBlock(rewriter);
-    continuationBlock->addArguments({statusType, resultType});
+    continuationBlock->addArguments({statusType, resultType}, {loc, loc});
     rewriter.replaceOp(srcOp, continuationBlock->getArguments());
 
     // Comparison index < 0.
@@ -1173,7 +1175,8 @@
         rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
     Block *typesMatchBlock = rewriter.createBlock(continuationBlock);
     Block *slowPathMismatchBlock = createSlowPathBlock(rewriter);
-    continuationBlock->addArguments({statusType, targetUnboxedType});
+    continuationBlock->addArguments({statusType, targetUnboxedType},
+                                    {loc, loc});
     rewriter.replaceOp(srcOp, continuationBlock->getArguments());
 
     // Type code extraction and comparison.
diff --git a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Utils/TypeInference.cpp b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Utils/TypeInference.cpp
index ed09f96..c7212e2 100644
--- a/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Utils/TypeInference.cpp
+++ b/integrations/tensorflow/iree-dialects/lib/Dialect/PyDM/Utils/TypeInference.cpp
@@ -63,11 +63,13 @@
 }
 
 Block *PermutedTypePropagator::createBlockPermutation(
-    ParentBlockInfo *parentInfo, TypeRange newArgumentTypes,
+    Location loc, ParentBlockInfo *parentInfo, TypeRange newArgumentTypes,
     BlockPermuteCallback initializeCallback) {
   Block *parentBlock = parentInfo->parentBlock;
   Block *newBlock = new Block();
-  newBlock->addArguments(newArgumentTypes);
+  for (Type newArgumentType : newArgumentTypes) {
+    newBlock->addArgument(newArgumentType, loc);
+  }
   newBlock->insertBefore(parentBlock);
 
   BlockAndValueMapping mapping;
diff --git a/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_pydm/rtl/modules/macros.py b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_pydm/rtl/modules/macros.py
index 5973070..8f72616 100644
--- a/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_pydm/rtl/modules/macros.py
+++ b/integrations/tensorflow/iree-dialects/python/iree/compiler/dialects/iree_pydm/rtl/modules/macros.py
@@ -45,7 +45,6 @@
 def unbox_i32(stage: ImportStage, value: ir.Value) -> ir.Value:
   return _unbox_i32(stage, value)
 
-
 def _unbox_i64(stage: ImportStage, value: ir.Value) -> ir.Value:
   i64_type = d.IntegerType.get_explicit(64)
   if d.ObjectType.isinstance(value.type):
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/convert_to_loops.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/convert_to_loops.mlir
index 0a33dc9..eff3c9b 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/convert_to_loops.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/convert_to_loops.mlir
@@ -507,9 +507,9 @@
 // CHECK:             memref.store %[[V0]], %[[OUT]][%[[T2]], %[[J]]] : memref<?x?xi32>
 
 func @scan_1d_inclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
-  %c0 = arith.constant 0 : i32
+  %c0 = memref.alloc() : memref<i32>
   iree_linalg_ext.scan dimension(0) inclusive(true)
-    ins(%0, %c0 : memref<128xi32>, i32) outs(%1 : memref<128xi32>) {
+    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
@@ -522,6 +522,7 @@
 // CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
 // CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
 // CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
 // CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
 // CHECK:           scf.if %[[COND]] {
@@ -533,14 +534,15 @@
 // CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]]]
 // CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
 // CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
+// CHECK:             memref.store %[[V4]], %[[ACC]][]
 // CHECK:           }
 
 // -----
 
 func @scan_1d_exclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
-  %c0 = arith.constant 0 : i32
+  %c0 = memref.alloc() : memref<i32>
   iree_linalg_ext.scan dimension(0) inclusive(false)
-    ins(%0, %c0 : memref<128xi32>, i32) outs(%1 : memref<128xi32>) {
+    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
@@ -553,25 +555,27 @@
 // CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
 // CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:     %[[C0_I32:.+]] = arith.constant 0 : i32
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
 // CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
 // CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
 // CHECK:           scf.if %[[COND]] {
-// CHECK:             memref.store %[[C0_I32]], %[[BUFO]][%[[ARG1]]]
+// CHECK:             %[[V0:.+]] = memref.load %[[ACC]][] : memref<i32>
+// CHECK:             memref.store %[[V0]], %[[BUFO]][%[[ARG1]]]
 // CHECK:           } else {
 // CHECK:             %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
 // CHECK:             %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]]]
 // CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[T1]]]
 // CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
 // CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
+// CHECK:             memref.store %[[V4]], %[[ACC]][]
 // CHECK:           }
 
 // -----
 
 func @scan_2d(%0: memref<16x32xi32>, %1: memref<16x32xi32>) {
-  %c0 = arith.constant 0 : i32
+  %t0 = memref.alloc() : memref<32xi32>
   iree_linalg_ext.scan dimension(0) inclusive(true)
-    ins(%0, %c0 : memref<16x32xi32>, i32) outs(%1 : memref<16x32xi32>) {
+    ins(%0 : memref<16x32xi32>) outs(%1, %t0 : memref<16x32xi32>, memref<32xi32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
@@ -585,6 +589,7 @@
 // CHECK-DAG:     %[[C32:.+]] = arith.constant 32 : index
 // CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<32xi32>
 // CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
 // CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C32]] step %[[C1]]
 // CHECK:             %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
@@ -597,4 +602,5 @@
 // CHECK:               %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]], %[[ARG2]]]
 // CHECK:               %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
 // CHECK:               memref.store %[[V4]], %[[BUFO]][%[[ARG1]], %[[ARG2]]]
+// CHECK:               memref.store %[[V4]], %[[ACC]][%[[ARG2]]]
 // CHECK:             }
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_contraction_to_block_size.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_contraction_to_block_size.mlir
index e9377d6..385bff8 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_contraction_to_block_size.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_contraction_to_block_size.mlir
@@ -3,19 +3,19 @@
 // CHECK-LABEL: @pad_matmul_static
 // Full verification is done on this case. Others use reduced checks.
 // CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_4:.*]] = linalg.pad_tensor %arg0 low[0, 0] high[6, 12]  {
+// CHECK:           %[[VAL_4:.*]] = tensor.pad %arg0 low[0, 0] high[6, 12]  {
 // CHECK:           ^bb0(%[[VAL_5:.*]]: index, %[[VAL_6:.*]]: index):
-// CHECK:             linalg.yield %[[VAL_3]] : f32
+// CHECK:             tensor.yield %[[VAL_3]] : f32
 // CHECK:           } : tensor<250x500xf32> to tensor<256x512xf32>
 // CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_8:.*]] = linalg.pad_tensor %arg1 low[0, 0] high[12, 4]  {
+// CHECK:           %[[VAL_8:.*]] = tensor.pad %arg1 low[0, 0] high[12, 4]  {
 // CHECK:           ^bb0(%[[VAL_9:.*]]: index, %[[VAL_10:.*]]: index):
-// CHECK:             linalg.yield %[[VAL_7]] : f32
+// CHECK:             tensor.yield %[[VAL_7]] : f32
 // CHECK:           } : tensor<500x1020xf32> to tensor<512x1024xf32>
 // CHECK:           %[[VAL_11:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_12:.*]] = linalg.pad_tensor %arg2 low[0, 0] high[6, 4]  {
+// CHECK:           %[[VAL_12:.*]] = tensor.pad %arg2 low[0, 0] high[6, 4]  {
 // CHECK:           ^bb0(%[[VAL_13:.*]]: index, %[[VAL_14:.*]]: index):
-// CHECK:             linalg.yield %[[VAL_11]] : f32
+// CHECK:             tensor.yield %[[VAL_11]] : f32
 // CHECK:           } : tensor<250x1020xf32> to tensor<256x1024xf32>
 // CHECK:           %[[VAL_15:.*]] = linalg.matmul ins(%[[VAL_16:.*]], %[[VAL_17:.*]] : tensor<256x512xf32>, tensor<512x1024xf32>) outs(%[[VAL_18:.*]] : tensor<256x1024xf32>) -> tensor<256x1024xf32>
 // CHECK:           %[[VAL_19:.*]] = tensor.extract_slice %[[VAL_15]][0, 0] [250, 1020] [1, 1] : tensor<256x1024xf32> to tensor<250x1020xf32>
@@ -49,10 +49,10 @@
 // CHECK:           %[[LHS_ALIGN:.*]] = arith.constant 16 : index
 // CHECK:           %[[LHS_DIM_ALIGNED:.*]] = iree_input.align %[[LHS_DIM]], %[[LHS_ALIGN]] : index
 // CHECK:           %[[LHS_ZERO:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[LHS_PADDED:.*]] = linalg.pad_tensor %arg0 low[0, 0] high{{\[}}%[[LHS_DIM_ALIGNED]], 0]   {
+// CHECK:           %[[LHS_PADDED:.*]] = tensor.pad %arg0 low[0, 0] high{{\[}}%[[LHS_DIM_ALIGNED]], 0]   {
 // CHECK:           } : tensor<?x512xf32> to tensor<?x512xf32>
 // Pad Output:
-// CHECK:           %[[OUTPUT_PADDED:.*]] = linalg.pad_tensor %arg2 low[0, 0] high{{\[}}{{.*}}, 0]  {
+// CHECK:           %[[OUTPUT_PADDED:.*]] = tensor.pad %arg2 low[0, 0] high{{\[}}{{.*}}, 0]  {
 // CHECK:           } : tensor<?x1024xf32> to tensor<?x1024xf32>
 // Matmul:
 // CHECK:           %[[PADDED_RESULT:.*]] = linalg.matmul ins(%[[LHS_PADDED]], %arg1 : tensor<?x512xf32>, tensor<512x1024xf32>) outs(%[[OUTPUT_PADDED]] : tensor<?x1024xf32>) -> tensor<?x1024xf32>
@@ -74,12 +74,12 @@
 // Pad RHS:
 // CHECK:           %[[RHS_ALIGNMENT:.*]] = arith.constant 32 : index
 // CHECK:           %[[RHS_ALIGNED_DIM:.*]] = iree_input.align %{{.*}}, %[[RHS_ALIGNMENT]] : index
-// CHECK:           %[[RHS_PADDED:.*]] = linalg.pad_tensor %arg1 low[0, 0] high[0, %[[RHS_ALIGNED_DIM]]]  {
+// CHECK:           %[[RHS_PADDED:.*]] = tensor.pad %arg1 low[0, 0] high[0, %[[RHS_ALIGNED_DIM]]]  {
 // CHECK:           } : tensor<512x?xf32> to tensor<512x?xf32>
 // Pad Output:
 // CHECK:           %[[OUTPUT_ALIGNMENT:.*]] = arith.constant 32 : index
 // CHECK:           %[[OUTPUT_ALIGNED_DIM:.*]] = iree_input.align %{{.*}}, %[[OUTPUT_ALIGNMENT]] : index
-// CHECK:           %[[OUTPUT_PADDED:.*]] = linalg.pad_tensor %arg2 low[0, 0] high[0, %[[OUTPUT_ALIGNED_DIM]]]  {
+// CHECK:           %[[OUTPUT_PADDED:.*]] = tensor.pad %arg2 low[0, 0] high[0, %[[OUTPUT_ALIGNED_DIM]]]  {
 // CHECK:           } : tensor<256x?xf32> to tensor<256x?xf32>
 // Matmul:
 // CHECK:           %{{.*}} = linalg.matmul ins(%arg0, %[[RHS_PADDED]] : tensor<256x512xf32>, tensor<512x?xf32>) outs(%[[OUTPUT_PADDED]] : tensor<256x?xf32>) -> tensor<256x?xf32>
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_tiling.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_tiling.mlir
new file mode 100644
index 0000000..f71ae8f
--- /dev/null
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/pad_tiling.mlir
@@ -0,0 +1,41 @@
+// RUN: iree-dialects-opt -iree-linalg-ext-tile -split-input-file %s | FileCheck  %s
+// XFAIL: *
+// TODO: Re-enable when upstream tensor.pad op properly implements the tiling
+// interface.
+
+func @pad_tensor(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
+    %arg3 : index, %arg4 : index, %arg5 : f32) -> tensor<?x?xf32> {
+  %0 = tensor.pad %arg0 low[%arg1, %arg2] high[%arg3, %arg4] {
+    ^bb0(%arg6 : index, %arg7 : index):
+      tensor.yield %arg5 : f32
+  } {__internal_linalg_transform__ = "tiling_input"}
+      :  tensor<?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s2 + s0 + s1)>
+//      CHECK: func @pad_tensor
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG5:[a-zA-Z0-9]+]]: f32
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor
+//      CHECK:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+//      CHECK:   %[[UBY:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]], %[[ARG3]], %[[D0]]]
+//      CHECK:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+//      CHECK:   %[[UBX:.+]] = affine.apply #[[MAP0]]()[%[[ARG2]], %[[ARG4]], %[[D1]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBY]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ARG7:.+]] = %[[INIT]])
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBX]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ARG9:.+]] = %[[ARG7]])
+//      CHECK:       %[[PAD_TILE:.+]] = scf.if
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[PAD_TILE]] into %[[ARG9]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   return %[[RESULT]]
diff --git a/integrations/tensorflow/iree-dialects/test/iree_linalgext/tiling.mlir b/integrations/tensorflow/iree-dialects/test/iree_linalgext/tiling.mlir
index 20cb462..901c157 100644
--- a/integrations/tensorflow/iree-dialects/test/iree_linalgext/tiling.mlir
+++ b/integrations/tensorflow/iree-dialects/test/iree_linalgext/tiling.mlir
@@ -1170,115 +1170,79 @@
 
 // -----
 
-func @pad_tensor(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
-    %arg3 : index, %arg4 : index, %arg5 : f32) -> tensor<?x?xf32> {
-  %0 = linalg.pad_tensor %arg0 low[%arg1, %arg2] high[%arg3, %arg4] {
-    ^bb0(%arg6 : index, %arg7 : index):
-      linalg.yield %arg5 : f32
-  } {__internal_linalg_transform__ = "tiling_input"}
-      :  tensor<?x?xf32> to tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-//  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s2 + s0 + s1)>
-//      CHECK: func @pad_tensor
-// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
-// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
-// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
-// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
-// CHECK-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
-// CHECK-SAME:   %[[ARG5:[a-zA-Z0-9]+]]: f32
-//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
-//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
-//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor
-//      CHECK:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
-//      CHECK:   %[[UBY:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]], %[[ARG3]], %[[D0]]]
-//      CHECK:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
-//      CHECK:   %[[UBX:.+]] = affine.apply #[[MAP0]]()[%[[ARG2]], %[[ARG4]], %[[D1]]]
-//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBY]] step %[[C10]]
-// CHECK-SAME:       iter_args(%[[ARG7:.+]] = %[[INIT]])
-//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBX]] step %[[C20]]
-// CHECK-SAME:         iter_args(%[[ARG9:.+]] = %[[ARG7]])
-//      CHECK:       %[[PAD_TILE:.+]] = scf.if
-//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[PAD_TILE]] into %[[ARG9]]
-// CHECK-SAME:           [%[[IV0]], %[[IV1]]]
-//      CHECK:       scf.yield %[[INSERT]]
-//      CHECK:     scf.yield %[[YIELD]]
-//      CHECK:   return %[[RESULT]]
-
-// -----
-
 func @scan_1d(%0: tensor<128xi32>) -> tensor<128xi32> {
-  %c0 = arith.constant 0 : i32
+  %c0 = linalg.init_tensor [] : tensor<i32>
   %1 = linalg.init_tensor [128] : tensor<128xi32>
-  %2 = iree_linalg_ext.scan
+  %2:2 = iree_linalg_ext.scan
     dimension(0) inclusive(true)
     {__internal_linalg_transform__ = "outer_reduce_input"}
-    ins(%0, %c0 : tensor<128xi32>, i32) outs(%1 : tensor<128xi32>) {
+    ins(%0 : tensor<128xi32>) outs(%1, %c0 : tensor<128xi32>, tensor<i32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
-  } -> tensor<128xi32>
-  return %2 : tensor<128xi32>
+  } -> tensor<128xi32>, tensor<i32>
+  return %2#0 : tensor<128xi32>
 }
 //      CHECK: func @scan_1d(
 // CHECK-SAME:   %[[OPERAND:.+]]: tensor<128xi32>
-//      CHECK:   %[[IDENTITY:.+]] = arith.constant 0 : i32
+//      CHECK:   %[[ACC:.+]] = linalg.init_tensor [] : tensor<i32>
 //      CHECK:   %[[OUTPUT:.+]] = linalg.init_tensor [128] : tensor<128xi32>
-//      CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scan
+//      CHECK:   %[[RESULT:.+]]:2 = iree_linalg_ext.scan
 // CHECK-SAME:           __internal_linalg_transform__ = "outer_reduce_output"
-// CHECK-SAME:       ins(%[[OPERAND]], %[[IDENTITY]] :
-// CHECK-SAME:       outs(%[[OUTPUT]] :
+// CHECK-SAME:       ins(%[[OPERAND]] :
+// CHECK-SAME:       outs(%[[OUTPUT]], %[[ACC]] :
 //      CHECK:   return %[[RESULT]]
 
 // -----
 
 func @scan_2d(%0: tensor<16x32xi32>) -> tensor<16x32xi32> {
-  %c0 = arith.constant 0 : i32
+  %c0 = linalg.init_tensor [32] : tensor<32xi32>
   %1 = linalg.init_tensor [16, 32] : tensor<16x32xi32>
-  %2 = iree_linalg_ext.scan
+  %2:2 = iree_linalg_ext.scan
     dimension(0) inclusive(true)
     {__internal_linalg_transform__ = "outer_reduce_input"}
-    ins(%0, %c0 : tensor<16x32xi32>, i32) outs(%1 : tensor<16x32xi32>) {
+    ins(%0 : tensor<16x32xi32>) outs(%1, %c0 : tensor<16x32xi32>, tensor<32xi32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
-  } -> tensor<16x32xi32>
-  return %2 : tensor<16x32xi32>
+  } -> tensor<16x32xi32>, tensor<32xi32>
+  return %2#0 : tensor<16x32xi32>
 }
 //  CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
 //      CHECK:  func @scan_2d(
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9_]+]]
-//  CHECK-DAG:    %[[IDENTITY:.+]] = arith.constant 0 : i32
 //      CHECK:    %[[C0:.+]] = arith.constant 0 : index
 //      CHECK:    %[[C16:.+]] = arith.constant 16 : index
 //      CHECK:    %[[C32:.+]] = arith.constant 32 : index
 //      CHECK:    %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:    %[[ACC:.+]] = linalg.init_tensor [32] : tensor<32xi32>
 //      CHECK:    %[[OUTPUT:.+]] = linalg.init_tensor [16, 32] : tensor<16x32xi32>
-//      CHECK:    %[[RESULT:.+]] = scf.for %[[I:.+]] = %[[C0]] to %[[C32]] step %[[C20]] 
-// CHECK-SAME:      iter_args(%[[ARG2:.+]] = %[[OUTPUT]])
+//      CHECK:    %[[RESULT:.+]]:2 = scf.for %[[I:.+]] = %[[C0]] to %[[C32]] step %[[C20]]
+// CHECK-SAME:      iter_args(%[[ARG2:.+]] = %[[OUTPUT]], %[[ARG3:.+]] = %[[ACC]])
 //      CHECK:      %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C20]], %[[C32]]]
 //      CHECK:      %[[UPDATE_SLICE_IN:.+]] = tensor.extract_slice %[[ARG0]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
-//      CHECK:      %[[UPDATE_SLICE_OUT:.+]] = tensor.extract_slice %[[OUTPUT]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
-//      CHECK:      %[[SCAN_TILE:.+]] = iree_linalg_ext.scan
+//      CHECK:      %[[UPDATE_SLICE_OUT:.+]] = tensor.extract_slice %[[ARG2]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_ACC:.+]] = tensor.extract_slice %[[ARG3]][%[[I]]] [%[[SIZE]]]
+//      CHECK:      %[[SCAN_TILE:.+]]:2 = iree_linalg_ext.scan
 // CHECK-SAME:       dimension(0) inclusive(true)
 // CHECK-SAME:       {__internal_linalg_transform__ = "outer_reduce_output"}
 // CHECK-SAME:       ins(%[[UPDATE_SLICE_IN]]
-// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]]
-//      CHECK:       %[[YIELD:.+]] = tensor.insert_slice %[[SCAN_TILE]] into %[[ARG2]][0, %[[I]]]
+// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]], %[[UPDATE_SLICE_ACC]]
+//      CHECK:       %[[YIELD:.+]] = tensor.insert_slice %[[SCAN_TILE]]#0 into %[[ARG2]][0, %[[I]]]
 // CHECK-SAME:           [%[[C16]], %[[SIZE]]]
-//      CHECK:       scf.yield %[[YIELD]]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:       %[[ACC_YIELD:.+]] = tensor.insert_slice %[[SCAN_TILE]]#1 into %[[ARG3]][%[[I]]]
+// CHECK-SAME:           [%[[SIZE]]]
+//      CHECK:       scf.yield %[[YIELD]], %[[ACC_YIELD]] : tensor<16x32xi32>, tensor<32xi32>
+//      CHECK:   return %[[RESULT]]#0
 
 // -----
 
 func @scan_2d_memref(%0: memref<16x32xi32>, %1: memref<16x32xi32>) {
-  %c0 = arith.constant 0 : i32
+  %c0 = memref.alloc() : memref<32xi32>
   iree_linalg_ext.scan
     dimension(0) inclusive(true)
     {__internal_linalg_transform__ = "outer_reduce_input"}
-    ins(%0, %c0 : memref<16x32xi32>, i32) outs(%1 : memref<16x32xi32>) {
+    ins(%0 : memref<16x32xi32>) outs(%1, %c0 : memref<16x32xi32>, memref<32xi32>) {
     ^bb0(%arg0 : i32, %arg1 : i32):
       %sum = arith.addi %arg0, %arg1 : i32
       iree_linalg_ext.yield %sum : i32
@@ -1290,18 +1254,19 @@
 //      CHECK:  func @scan_2d_memref(
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9_]+]]
 // CHECK-SAME:    %[[ARG1:[a-zA-Z0-9_]+]]
-//  CHECK-DAG:    %[[IDENTITY:.+]] = arith.constant 0 : i32
 //      CHECK:    %[[C0:.+]] = arith.constant 0 : index
 //      CHECK:    %[[C16:.+]] = arith.constant 16 : index
 //      CHECK:    %[[C32:.+]] = arith.constant 32 : index
 //      CHECK:    %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:    %[[ACC:.+]] = memref.alloc() : memref<32xi32>
 //      CHECK:    scf.for %[[I:.+]] = %[[C0]] to %[[C32]] step %[[C20]]
 //      CHECK:      %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C20]], %[[C32]]]
 //      CHECK:      %[[UPDATE_SLICE_IN:.+]] = memref.subview %[[ARG0]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
 //      CHECK:      %[[UPDATE_SLICE_OUT:.+]] = memref.subview %[[ARG1]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_ACC:.+]] = memref.subview %[[ACC]][%[[I]]] [%[[SIZE]]]
 //      CHECK:      iree_linalg_ext.scan
 // CHECK-SAME:       dimension(0) inclusive(true)
 // CHECK-SAME:       {__internal_linalg_transform__ = "outer_reduce_output"}
 // CHECK-SAME:       ins(%[[UPDATE_SLICE_IN]]
-// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]]
+// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]], %[[UPDATE_SLICE_ACC]]
 //      CHECK:   return
diff --git a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/booleans.py b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/booleans.py
index 1c856fa..9d86f37 100644
--- a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/booleans.py
+++ b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/booleans.py
@@ -7,7 +7,7 @@
 # CHECK-LABEL: @logical_and
 # CHECK: %[[XVAL:.*]] = load_var %x
 # CHECK: %[[XBOOL:.*]] = as_bool %[[XVAL]]
-# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}} {
+# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}}{
 # CHECK:   %[[YVAL:.*]] = load_var %y
 # CHECK:   %[[YBOOL:.*]] = as_bool %[[YVAL]]
 # CHECK:   %[[R2:.*]] = functional_if %[[YBOOL]] {{.*}} {
@@ -31,7 +31,7 @@
 # # CHECK-LABEL: @logical_or
 # CHECK: %[[XVAL:.*]] = load_var %x
 # CHECK: %[[XBOOL:.*]] = as_bool %[[XVAL]]
-# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}} {
+# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}}{
 # CHECK:   yield %[[XVAL]]
 # CHECK: } else {
 # CHECK:   %[[YVAL:.*]] = load_var %y
@@ -67,7 +67,7 @@
 # CHECK-LABEL: func @conditional
 # CHECK: %[[XVAL:.*]] = load_var %x
 # CHECK: %[[XBOOL:.*]] = as_bool %[[XVAL]]
-# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}} {
+# CHECK: %[[R1:.*]] = functional_if %[[XBOOL]] {{.*}}{
 # CHECK:   %[[TWOVAL:.*]] = constant 2
 # CHECK:   %[[TWOBOXED:.*]] = box %[[TWOVAL]] : !iree_pydm.integer ->
 # CHECK:   yield %[[TWOBOXED]]
diff --git a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/comparison.py b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/comparison.py
index 1a0efc7..2ba7048 100644
--- a/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/comparison.py
+++ b/integrations/tensorflow/iree-dialects/test/python/iree_pydm/importer/comparison.py
@@ -115,7 +115,7 @@
 # CHECK-DAG: %[[Y:.*]] = load_var %y
 # CHECK: %[[XP:.*]], %[[YP:.*]] = dynamic_binary_promote %[[X]], %[[Y]]
 # CHECK: %[[R1:.*]] = apply_compare "lt", %[[XP]], %[[YP]]
-# CHECK: %[[RESULT:.*]] = functional_if %[[R1]] {{.*}} {
+# CHECK: %[[RESULT:.*]] = functional_if %[[R1]] {{.*}}{
 # CHECK:   %[[Z:.*]] = load_var %z
 # NOTE: Promotion happens on original loaded values, not already promoted
 # values.
@@ -144,7 +144,7 @@
 
 # CHECK-LABEL: nested_short_circuit_expression
 # Verify that the nested expression is evaluated in the context of the if.
-# CHECK: functional_if {{.*}} {
+# CHECK: functional_if {{.*}}{
 # CHECK:   apply_binary "add"
 # CHECK: } else {
 @test_import_global
diff --git a/integrations/tensorflow/iree_tf_compiler/TF/LowerGlobalTensors.cpp b/integrations/tensorflow/iree_tf_compiler/TF/LowerGlobalTensors.cpp
index 4232e23..20ca6e7 100644
--- a/integrations/tensorflow/iree_tf_compiler/TF/LowerGlobalTensors.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TF/LowerGlobalTensors.cpp
@@ -9,6 +9,7 @@
 #include "iree_tf_compiler/TF/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/MLIRContext.h"
@@ -18,7 +19,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
diff --git a/integrations/tensorflow/iree_tf_compiler/TF/SavedModelToIreeABI.cpp b/integrations/tensorflow/iree_tf_compiler/TF/SavedModelToIreeABI.cpp
index 9e88332..cc7cde8 100644
--- a/integrations/tensorflow/iree_tf_compiler/TF/SavedModelToIreeABI.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TF/SavedModelToIreeABI.cpp
@@ -21,6 +21,7 @@
 #include "iree_tf_compiler/TF/Passes.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/JSON.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/MLIRContext.h"
@@ -29,7 +30,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 
@@ -409,8 +409,8 @@
                                     StringRef exportedName) {
   Location loc = internalFunc.getLoc();
   OpBuilder builder(internalFunc);
-  const Identifier savedModelIndexPathIdent =
-      builder.getIdentifier("tf_saved_model.index_path");
+  const StringAttr savedModelIndexPathIdent =
+      builder.getStringAttr("tf_saved_model.index_path");
   FunctionType internalFuncType = internalFunc.getType();
   json::Array refArgs;
   json::Array refReturns;
@@ -585,8 +585,8 @@
 
   LogicalResult run() {
     mlir::Builder builder(getOperation());
-    const Identifier savedModelIndexPathIdent =
-        builder.getIdentifier("tf_saved_model.index_path");
+    const StringAttr savedModelIndexPathIdent =
+        builder.getStringAttr("tf_saved_model.index_path");
     (void)savedModelIndexPathIdent;
 
     // Handle saved model exported functions.
diff --git a/integrations/tensorflow/test/iree_tfl_tests/llvmaot_posenet_i8.run b/integrations/tensorflow/test/iree_tfl_tests/llvmaot_posenet_i8.run
index cf2020d..2f4de27 100644
--- a/integrations/tensorflow/test/iree_tfl_tests/llvmaot_posenet_i8.run
+++ b/integrations/tensorflow/test/iree_tfl_tests/llvmaot_posenet_i8.run
@@ -1,3 +1,2 @@
 # REQUIRES: llvmaot
 # RUN: %PYTHON -m iree_tfl_tests.posenet_i8_test --target_backend=llvmaot -artifacts_dir=%t
-# XFAIL: *
diff --git a/iree/compiler/Bindings/Native/Transforms/BUILD b/iree/compiler/Bindings/Native/Transforms/BUILD
index 0fee820..60ee5b8 100644
--- a/iree/compiler/Bindings/Native/Transforms/BUILD
+++ b/iree/compiler/Bindings/Native/Transforms/BUILD
@@ -25,6 +25,7 @@
         "//iree/compiler/Dialect/Util/IR",
         "//iree/compiler/Utils",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Shape",
diff --git a/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt b/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt
index ae88184..56ef523 100644
--- a/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt
+++ b/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@
     "WrapEntryPoints.cpp"
   DEPS
     LLVMSupport
+    MLIRAffineUtils
     MLIRIR
     MLIRPass
     MLIRShape
diff --git a/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp b/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
index a3873b8..7fc9d44 100644
--- a/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
+++ b/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Attributes.h"
@@ -16,7 +17,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/Bindings/TFLite/Transforms/BUILD b/iree/compiler/Bindings/TFLite/Transforms/BUILD
index 8482307..b20f76e 100644
--- a/iree/compiler/Bindings/TFLite/Transforms/BUILD
+++ b/iree/compiler/Bindings/TFLite/Transforms/BUILD
@@ -26,6 +26,7 @@
         "//iree/compiler/Dialect/Util/IR",
         "//iree/compiler/Utils",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Shape",
diff --git a/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt b/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt
index 2e8ee00..7c90253 100644
--- a/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt
+++ b/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@
     "WrapEntryPoints.cpp"
   DEPS
     LLVMSupport
+    MLIRAffineUtils
     MLIRIR
     MLIRPass
     MLIRShape
diff --git a/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp b/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
index e73966e..ba49e4c 100644
--- a/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
+++ b/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
@@ -10,6 +10,7 @@
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Attributes.h"
@@ -19,7 +20,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index 4bf0cf9..378a408 100644
--- a/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
+++ b/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
@@ -31,6 +31,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h"
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -48,7 +49,6 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Transforms/BufferUtils.h"
 #include "mlir/Transforms/Passes.h"
 
 #define DEBUG_TYPE "iree-codegen-linalg-bufferize"
diff --git a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 099a7e5..02a43d4 100644
--- a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Codegen/Passes.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -13,7 +14,6 @@
 #include "mlir/Dialect/Vector/VectorTransforms.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/Codegen/SPIRV/BUILD b/iree/compiler/Codegen/SPIRV/BUILD
index 2e94325..e0a2bd5 100644
--- a/iree/compiler/Codegen/SPIRV/BUILD
+++ b/iree/compiler/Codegen/SPIRV/BUILD
@@ -51,6 +51,7 @@
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Affine",
         "@llvm-project//mlir:AffineToStandardTransforms",
+        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithmeticDialect",
         "@llvm-project//mlir:ArithmeticToSPIRV",
diff --git a/iree/compiler/Codegen/SPIRV/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/CMakeLists.txt
index 455476b..3928c3f 100644
--- a/iree/compiler/Codegen/SPIRV/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/CMakeLists.txt
@@ -41,6 +41,7 @@
     LLVMSupport
     MLIRAffine
     MLIRAffineToStandard
+    MLIRAffineUtils
     MLIRAnalysis
     MLIRArithmetic
     MLIRArithmeticToSPIRV
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp b/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
index 1e39e89..af35198 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVTileAndDistribute.cpp
@@ -21,6 +21,7 @@
 #include "iree/compiler/Codegen/Utils/MarkerUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
@@ -37,7 +38,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/FoldUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/LoopUtils.h"
 
 #define DEBUG_TYPE "iree-spirv-tile-and-distribute"
 
diff --git a/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp b/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
index d715b0f..7804fe0 100644
--- a/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Modules/VMVX/Transforms/Passes.cpp
@@ -63,7 +63,7 @@
   nestedModulePM.addPass(createCanonicalizerPass());
   nestedModulePM.addPass(createCSEPass());
   nestedModulePM.addPass(createFlattenMemRefSubspanPass());
-  nestedModulePM.addPass(createNormalizeMemRefsPass());
+  nestedModulePM.addPass(memref::createNormalizeMemRefsPass());
   nestedModulePM.addNestedPass<FuncOp>(createAffineScalarReplacementPass());
   nestedModulePM.addPass(createCanonicalizerPass());
 }
diff --git a/iree/compiler/Dialect/VM/Transforms/BUILD b/iree/compiler/Dialect/VM/Transforms/BUILD
index 426edfa..9f9e500 100644
--- a/iree/compiler/Dialect/VM/Transforms/BUILD
+++ b/iree/compiler/Dialect/VM/Transforms/BUILD
@@ -36,6 +36,8 @@
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Affine",
         "@llvm-project//mlir:AffineToStandardTransforms",
+        "@llvm-project//mlir:AffineTransforms",
+        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MathDialect",
         "@llvm-project//mlir:MemRefDialect",
diff --git a/iree/compiler/Dialect/VM/Transforms/CMakeLists.txt b/iree/compiler/Dialect/VM/Transforms/CMakeLists.txt
index cc69f45..03275c2 100644
--- a/iree/compiler/Dialect/VM/Transforms/CMakeLists.txt
+++ b/iree/compiler/Dialect/VM/Transforms/CMakeLists.txt
@@ -27,6 +27,8 @@
     LLVMSupport
     MLIRAffine
     MLIRAffineToStandard
+    MLIRAffineTransforms
+    MLIRAffineUtils
     MLIRIR
     MLIRMath
     MLIRMemRef
diff --git a/iree/compiler/Dialect/VM/Transforms/GlobalInitialization.cpp b/iree/compiler/Dialect/VM/Transforms/GlobalInitialization.cpp
index 38d9f09..703a7c1 100644
--- a/iree/compiler/Dialect/VM/Transforms/GlobalInitialization.cpp
+++ b/iree/compiler/Dialect/VM/Transforms/GlobalInitialization.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/VM/IR/VMOps.h"
 #include "iree/compiler/Dialect/VM/Transforms/Passes.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/MLIRContext.h"
@@ -16,7 +17,6 @@
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/InliningUtils.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/Dialect/VM/Transforms/OrdinalAllocation.cpp b/iree/compiler/Dialect/VM/Transforms/OrdinalAllocation.cpp
index 2506607..e923543 100644
--- a/iree/compiler/Dialect/VM/Transforms/OrdinalAllocation.cpp
+++ b/iree/compiler/Dialect/VM/Transforms/OrdinalAllocation.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/VM/IR/VMOps.h"
 #include "iree/compiler/Dialect/VM/Transforms/Passes.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/SymbolTable.h"
@@ -14,7 +15,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/Dialect/VM/Transforms/Passes.cpp b/iree/compiler/Dialect/VM/Transforms/Passes.cpp
index db2720a..8b9f3ac 100644
--- a/iree/compiler/Dialect/VM/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/VM/Transforms/Passes.cpp
@@ -11,6 +11,7 @@
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "iree/compiler/Dialect/VM/IR/VMOps.h"
 #include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
+#include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/iree/compiler/Dialect/VM/Transforms/SinkDefiningOps.cpp b/iree/compiler/Dialect/VM/Transforms/SinkDefiningOps.cpp
index c0ab030..d29d7fe 100644
--- a/iree/compiler/Dialect/VM/Transforms/SinkDefiningOps.cpp
+++ b/iree/compiler/Dialect/VM/Transforms/SinkDefiningOps.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/VM/IR/VMOps.h"
 #include "iree/compiler/Dialect/VM/Transforms/Passes.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/MLIRContext.h"
@@ -16,7 +17,6 @@
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/InputConversion/MHLO/BUILD b/iree/compiler/InputConversion/MHLO/BUILD
index 1fb8577..6587000 100644
--- a/iree/compiler/InputConversion/MHLO/BUILD
+++ b/iree/compiler/InputConversion/MHLO/BUILD
@@ -71,6 +71,7 @@
         "//llvm-external-projects/iree-dialects:IREELinalgExtTransforms",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Affine",
+        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:IR",
diff --git a/iree/compiler/InputConversion/MHLO/CMakeLists.txt b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
index 1eacad0..9d45f65 100644
--- a/iree/compiler/InputConversion/MHLO/CMakeLists.txt
+++ b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
@@ -61,6 +61,7 @@
     LLVMSupport
     LmhloDialect
     MLIRAffine
+    MLIRAffineUtils
     MLIRComplex
     MLIRIR
     MLIRLinalg
diff --git a/iree/compiler/InputConversion/MHLO/FlattenTuplesInCFG.cpp b/iree/compiler/InputConversion/MHLO/FlattenTuplesInCFG.cpp
index ed1fd9a..18a7068 100644
--- a/iree/compiler/InputConversion/MHLO/FlattenTuplesInCFG.cpp
+++ b/iree/compiler/InputConversion/MHLO/FlattenTuplesInCFG.cpp
@@ -10,13 +10,13 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
 #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/compiler/InputConversion/MHLO/LegalizeInputTypes.cpp b/iree/compiler/InputConversion/MHLO/LegalizeInputTypes.cpp
index f2f91a2..891c063 100644
--- a/iree/compiler/InputConversion/MHLO/LegalizeInputTypes.cpp
+++ b/iree/compiler/InputConversion/MHLO/LegalizeInputTypes.cpp
@@ -9,6 +9,7 @@
 #include "iree/compiler/InputConversion/MHLO/PassDetail.h"
 #include "iree/compiler/InputConversion/MHLO/Passes.h"
 #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
@@ -22,7 +23,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/Utils.h"
 
 namespace mlir {
 namespace iree_compiler {
diff --git a/iree/tools/init_mlir_passes.h b/iree/tools/init_mlir_passes.h
index b3cba96..4d6c5f3 100644
--- a/iree/tools/init_mlir_passes.h
+++ b/iree/tools/init_mlir_passes.h
@@ -44,7 +44,7 @@
   registerLoopCoalescingPass();
   registerLoopInvariantCodeMotionPass();
   registerAffineScalarReplacementPass();
-  registerParallelLoopCollapsingPass();
+  registerSCFParallelLoopCollapsingPass();
   registerPrintOpStatsPass();
   registerViewOpGraphPass();
   registerStripDebugInfoPass();
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 92c1c63..7cb4c26 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 92c1c63daeaf0b6b7abc6561133e2d3dbda80f8c
+Subproject commit 7cb4c2617391b80993e7c10f3a34c9e172f7ad41
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index 9ec5480..631843e 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit 9ec54804459fd6d8c47dec22bc22d32f82ec4fee
+Subproject commit 631843e39eea2affa61295b3394055c873a36cd0
diff --git a/third_party/mlir-hlo.branch-pin b/third_party/mlir-hlo.branch-pin
deleted file mode 100644
index 47edf9d..0000000
--- a/third_party/mlir-hlo.branch-pin
+++ /dev/null
@@ -1 +0,0 @@
-patched-mlir-hlo-20220118