[CPU] Introduce TilingConfig class (#14082)

TilingConfig is a simple step towards separating the API used to
retrieve tile size information from the actual representation and
implementation of that information. It will let us implement different
tiling configuration scenarios without exposing implementation
details, and will even allow replacing LoweringConfig with something
else without impacting TilingConfig users.
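
For illustration, pass pipelines now query tile information through
TilingConfig instead of indexing lowering_config levels directly. A
rough usage sketch, using only the accessors added in
TileSizeSelection.h below (not itself part of the patch):

  IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
  TilingConfig tilingConfig(loweringConfig);
  unsigned parallelLevel = tilingConfig.getVectorParallelLevel();
  SmallVector<int64_t> parallelSizes = tilingConfig.getVectorParallelSizes();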
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index 44a049d..00c80c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -38,6 +38,7 @@
         "LLVMCPUVectorization.cpp",
         "Passes.cpp",
         "TargetMLTransformInfo.cpp",
+        "TileSizeSelection.cpp",
         "Utils.cpp",
         "VectorContractCustomKernels.cpp",
         "VerifyLinalgTransformLegality.cpp",
@@ -47,6 +48,7 @@
         "KernelDispatch.h",
         "LLVMCPUPasses.h",
         "TargetMLTransformInfo.h",
+        "TileSizeSelection.h",
         "Utils.h",
     ],
     deps = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index b53515c..e75cb67 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -18,6 +18,7 @@
     "KernelDispatch.h"
     "LLVMCPUPasses.h"
     "TargetMLTransformInfo.h"
+    "TileSizeSelection.h"
     "Utils.h"
   SRCS
     "ConvertToLLVM.cpp"
@@ -43,6 +44,7 @@
     "LLVMCPUVectorization.cpp"
     "Passes.cpp"
     "TargetMLTransformInfo.cpp"
+    "TileSizeSelection.cpp"
     "Utils.cpp"
     "VectorContractCustomKernels.cpp"
     "VerifyLinalgTransformLegality.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
index 130dc2a..44a1b39 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
@@ -13,22 +13,6 @@
 namespace mlir {
 namespace iree_compiler {
 
-// TODO(hanchung): Create a pass to handle detailed logic about splitting tiling
-// sizes for parallel dims and reduction dims.
-// We have to fuse the fill + named_op + generic ops along parallel dims
-// firstly. At this stage, we do not apply vectorization. The reduction dim
-// won't get tiled if the case is matmul + generic op. In this case, we have to
-// tile along reduction dim again, which needs them to be TilingInterface ops.
-enum class TilingLevel : unsigned {
-  // Tile TilingInterface operations to threads.
-  WorkGroupTiles = 0,
-  // Tile TilingInterface operation on workgroup thread for parallel dims.
-  ParallelTiles = 1,
-  // Tile TilingInterface operations on workgroup thread for reduction dims.
-  ReductionTiles = 2,
-  NumTileLevels = 3
-};
-
 LogicalResult initCPULaunchConfig(ModuleOp moduleOp);
 
 }  // namespace iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index ba5f638..59eefd3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -7,8 +7,10 @@
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree-dialects/Dialect/LinalgTransform/LinalgTransformOps.h"
 #include "iree/compiler/Codegen/Dialect/IREECodegenDialect.h"
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
 #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
 #include "iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h"
+#include "iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h"
 #include "iree/compiler/Codegen/LLVMCPU/Utils.h"
 #include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
@@ -24,6 +26,8 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
 
+using mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr;
+
 namespace mlir {
 namespace iree_compiler {
 
@@ -106,12 +110,53 @@
   auto walkResult = module.walk([&](Operation *op) -> WalkResult {
     IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
     if (!loweringConfig) return WalkResult::advance();
-    return verificationFn(op, loweringConfig, translationInfo,
+    TilingConfig tilingConfig(loweringConfig);
+    return verificationFn(op, tilingConfig, translationInfo,
                           ArrayRef<int64_t>{});
   });
   return failure(walkResult.wasInterrupted());
 }
 
+// TODO(dcaballe): We temporarily need this utility to retrieve a valid
+// lowering config. We should be able to remove this once we have a lowering
+// config attribute per op.
+static FailureOr<LoweringConfigAttr> getRootLoweringConfig(Operation *op) {
+  // Check for self first.
+  auto rootLoweringConfig = iree_compiler::getLoweringConfig(op);
+  if (rootLoweringConfig) {
+    return rootLoweringConfig;
+  }
+
+  auto result = op->walk([&](Operation *op) -> WalkResult {
+    auto loweringConfig = iree_compiler::getLoweringConfig(op);
+    if (!loweringConfig) {
+      return WalkResult::advance();
+    }
+    if (rootLoweringConfig) {
+      if (rootLoweringConfig != loweringConfig) {
+        return WalkResult::interrupt();
+      }
+    } else {
+      rootLoweringConfig = loweringConfig;
+    }
+    return WalkResult::advance();
+  });
+
+  if (!rootLoweringConfig || result.wasInterrupted()) {
+    return failure();
+  }
+
+  return rootLoweringConfig;
+}
+
+static TilingConfig getTilingConfigForPipeline(
+    IREE::HAL::ExecutableVariantOp variantOp) {
+  auto maybeLoweringConfig = getRootLoweringConfig(variantOp);
+  assert(succeeded(maybeLoweringConfig) &&
+         "Pipeline requires a lowering config");
+  return TilingConfig(*maybeLoweringConfig);
+}
+
 void LLVMCPULowerExecutableTargetPass::runOnOperation() {
   IREE::HAL::ExecutableVariantOp variantOp = getOperation();
   ModuleOp moduleOp = variantOp.getInnerModule();
@@ -179,7 +224,8 @@
               moduleOp, translationInfo.value(),
               verifyConvTileAndDecomposeExpertConfig);
           break;
-        default:;
+        default:
+          break;
       }
       if (failed(verificationStatus)) {
         return signalPassFailure();
@@ -190,6 +236,7 @@
       bool enableVectorMasking =
           isX86(target) || isRISCV(target) ||
           (isAArch64(target) && hasAnySVEFeature(target));
+
       bool enableMicrokernels = hasMicrokernels(target);
       bool enableAArch64SSVE = isAArch64(target) && hasAnySVEFeature(target) &&
                                hasSMEFeature(target);
@@ -200,44 +247,56 @@
             addCPUDefaultPassPipeline(executableLoweringPipeline);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
-              CPUBufferOpsTileAndVectorize:
-            addCPUBufferOpsTileAndVectorizePipeline(executableLoweringPipeline,
-                                                    enableVectorMasking,
-                                                    enableAArch64SSVE);
+              CPUBufferOpsTileAndVectorize: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
+            addCPUBufferOpsTileAndVectorizePipeline(
+                executableLoweringPipeline, tilingConfig, enableVectorMasking,
+                enableAArch64SSVE);
             break;
+          }
           case IREE::Codegen::DispatchLoweringPassPipeline::
-              CPUDoubleTilingExpert:
+              CPUDoubleTilingExpert: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
             addMultiTilingExpertPassPipeline(
-                executableLoweringPipeline,
-                static_cast<int>(TilingLevel::NumTileLevels),
+                executableLoweringPipeline, tilingConfig,
                 /*enablePeeling=*/false, enableVectorMasking, lowerToAVX2);
             break;
+          }
           case IREE::Codegen::DispatchLoweringPassPipeline::
-              CPUDoubleTilingPadExpert:
-            addDoubleTilingPadExpertPassPipeline(executableLoweringPipeline,
-                                                 enableVectorMasking);
+              CPUDoubleTilingPadExpert: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
+            addDoubleTilingPadExpertPassPipeline(
+                executableLoweringPipeline, tilingConfig, enableVectorMasking);
             break;
+          }
           case IREE::Codegen::DispatchLoweringPassPipeline::
-              CPUDoubleTilingPeelingExpert:
+              CPUDoubleTilingPeelingExpert: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
             addMultiTilingExpertPassPipeline(
-                executableLoweringPipeline,
-                static_cast<int>(TilingLevel::NumTileLevels),
+                executableLoweringPipeline, tilingConfig,
                 /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2,
                 enableAArch64SSVE);
             break;
+          }
           case IREE::Codegen::DispatchLoweringPassPipeline::
-              CPUConvTileAndDecomposeExpert:
+              CPUConvTileAndDecomposeExpert: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
             addConvTileAndDecomposeExpertPassPipeline(
-                executableLoweringPipeline, enableVectorMasking,
+                executableLoweringPipeline, tilingConfig, enableVectorMasking,
                 enableAArch64SSVE);
             break;
-          case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert:
+          }
+          case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
             addMmt4dTilingExpertPassPipeline(executableLoweringPipeline,
-                                             enableMicrokernels);
+                                             tilingConfig, enableMicrokernels);
             break;
-          case IREE::Codegen::DispatchLoweringPassPipeline::CPUDataTiling:
-            addCPUDataTilingPipeline(executableLoweringPipeline);
+          }
+          case IREE::Codegen::DispatchLoweringPassPipeline::CPUDataTiling: {
+            TilingConfig tilingConfig = getTilingConfigForPipeline(variantOp);
+            addCPUDataTilingPipeline(executableLoweringPipeline, tilingConfig);
             break;
+          }
           case IREE::Codegen::DispatchLoweringPassPipeline::VMVXDefault:
             addVMVXDefaultPassPipeline(executableLoweringPipeline,
                                        enableMicrokernels);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
index 467bcef..4fcfc21 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h
@@ -18,6 +18,8 @@
 namespace mlir {
 namespace iree_compiler {
 
+class TilingConfig;
+
 /// Performs the final conversion to LLVM dialect.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertToLLVMPass(
     bool reassociateFpReordering = false);
@@ -116,15 +118,18 @@
 //----------------------------------------------------------------------------//
 // LLVMCPU backend Pass Pipelines.
 //----------------------------------------------------------------------------//
+
 /// Populates the passes to lower linalg ops on buffers. Currenly this
 /// pipeline is only used for dispatches that just copy data from input
 /// interfaces to output interface.
 void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
+                                             TilingConfig &tilingConfig,
                                              bool enableVectorMasking,
                                              bool enableAArch64SSVE = false);
 
 /// Populates the passes to lower ops through data tiling transformations.
-void addCPUDataTilingPipeline(OpPassManager &passManager);
+void addCPUDataTilingPipeline(OpPassManager &passManager,
+                              TilingConfig &tilingConfig);
 
 /// Populates the passes to lower to scalars operations for linalg based
 /// code-generation. This pipeline does not vectorize, but instead just
@@ -132,22 +137,23 @@
 void addCPUDefaultPassPipeline(OpPassManager &passManager);
 
 void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
+                                               TilingConfig &tilingConfig,
                                                bool enableVectorMasking,
                                                bool enableAArch64SSVE = false);
 
 void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
+                                          TilingConfig &tilingConfig,
                                           bool enableVectorMasking);
 
 /// Populates the passes needed to multi level tile, fuse and vectorize
 /// lowering of linalg ops on tensors to vectors operations.
 void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+                                      TilingConfig &tilingConfig,
                                       bool enableMicrokernels);
 
-void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
-                                      int64_t numLevels, bool enablePeeling,
-                                      bool enableVectorMasking,
-                                      bool lowerToAVX2,
-                                      bool enableAArch64SSVE = false);
+void addMultiTilingExpertPassPipeline(
+    OpPassManager &passManager, TilingConfig &tilingConfig, bool enablePeeling,
+    bool enableVectorMasking, bool lowerToAVX2, bool enableAArch64SSVE = false);
 
 void addTensorToVectorsPassPipeline(OpPassManager &passManager,
                                     bool lowerToVectors = true);
@@ -162,13 +168,13 @@
 // Populates the passes needed to do tiling, decomposing, and vectorizing the
 // convolution ops.
 LogicalResult verifyConvTileAndDecomposeExpertConfig(
-    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    Operation *op, TilingConfig &tilingConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize = {});
 
 /// Populates the passes needed to do two-level tile + vectorize of linalg ops.
 LogicalResult verifyDoubleTilingExpertPassPipelineConfig(
-    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    Operation *op, TilingConfig &tilingConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize = {});
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 6c44db8..a3bc8df 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -11,6 +11,7 @@
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
 #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
 #include "iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h"
+#include "iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h"
 #include "iree/compiler/Codegen/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Codegen/VMVX/VMVXPasses.h"
@@ -162,7 +163,7 @@
 }
 
 LogicalResult verifyDoubleTilingExpertPassPipelineConfig(
-    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    Operation *op, TilingConfig &tilingConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize) {
   if (!workgroupSize.empty()) {
@@ -184,10 +185,9 @@
                                 CPUDoubleTilingPadExpert);
   }
 
-  if (loweringConfig.getTileSizes().size() !=
-      static_cast<unsigned>(TilingLevel::NumTileLevels)) {
-    return op->emitOpError("expected three tiling sizes, got ")
-           << loweringConfig.getTileSizes().size();
+  if (tilingConfig.getNumTilingLevels() != 3) {
+    return op->emitOpError("expected three tiling levels, got ")
+           << tilingConfig.getNumTilingLevels();
   }
 
   auto interfaceOp = dyn_cast_or_null<TilingInterface>(op);
@@ -200,36 +200,36 @@
       }
     }
 
-    SmallVector<int64_t> secondLevelTileSizes = loweringConfig.getTileSizeVals(
-        static_cast<unsigned>(TilingLevel::ParallelTiles));
+    SmallVector<int64_t> secondLevelTileSizes =
+        tilingConfig.getVectorParallelSizes();
     for (auto [index, tileSize] : llvm::enumerate(secondLevelTileSizes)) {
       if (tileSize != 0 && !pLoopsSet.contains(index)) {
         return op->emitOpError(
-                   "expected only parallel dims to be set in the "
-                   "second tiling sizes, got ")
+                   "expected only parallel dims to be set in the second tiling "
+                   "level, got ")
                << index << "-th tile size set";
       }
     }
 
-    SmallVector<int64_t> thirdLevelTileSizes = loweringConfig.getTileSizeVals(
-        static_cast<unsigned>(TilingLevel::ReductionTiles));
+    SmallVector<int64_t> thirdLevelTileSizes =
+        tilingConfig.getVectorReductionSizes();
     for (auto [index, tileSize] : llvm::enumerate(thirdLevelTileSizes)) {
       if (tileSize != 0 && pLoopsSet.contains(index)) {
         return op->emitOpError(
-                   "expected only reduction dims to be set in the third "
-                   "tiling sizes, got ")
+                   "expected only reduction dims to be set in the third tiling "
+                   "level, got ")
                << index << "-th tile size set";
       }
     }
   }
 
   // Verify interchange
-  if (!loweringConfig.getTileInterchange().empty()) {
+  if (!tilingConfig.getTileInterchange().empty()) {
     for (auto level : llvm::seq<unsigned>(
-             0, static_cast<unsigned>(
-                    loweringConfig.getTileInterchange().size()))) {
-      auto tileSizes = loweringConfig.getTileSizeVals(level);
-      auto interchange = loweringConfig.getTileInterchangeVals(level);
+             0,
+             static_cast<unsigned>(tilingConfig.getTileInterchange().size()))) {
+      auto tileSizes = tilingConfig.getTileSizes()[level];
+      auto interchange = tilingConfig.getTileInterchangeSizes(level);
       if (!isValidInterchange(interchange, tileSizes.size())) {
         return op->emitOpError("expected [0, ")
                << tileSizes.size()
@@ -239,8 +239,7 @@
   }
 
   // Verify that native vector size is empty.
-  SmallVector<int64_t> nativeVectorSize =
-      loweringConfig.getNativeVectorSizeVals();
+  SmallVector<int64_t> nativeVectorSize = tilingConfig.getNativeVectorSizes();
   if (!nativeVectorSize.empty()) {
     return op->emitOpError("native_vector_size must be empty");
   }
@@ -248,18 +247,17 @@
 }
 
 LogicalResult verifyConvTileAndDecomposeExpertConfig(
-    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    Operation *op, TilingConfig &tilingConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize) {
-  if (loweringConfig.getTileSizes().size() !=
-      static_cast<unsigned>(TilingLevel::NumTileLevels)) {
-    return op->emitOpError("expected three tiling sizes, got ")
-           << loweringConfig.getTileSizes().size();
+  if (tilingConfig.getNumTilingLevels() != 3) {
+    return op->emitOpError("expected three tiling levels, got ")
+           << tilingConfig.getNumTilingLevels();
   }
 
   linalg::LinalgOp linalgOp = cast<linalg::LinalgOp>(op);
   SmallVector<int64_t> shape = linalgOp.getStaticLoopRanges();
-  for (auto sizes : loweringConfig.getTileSizeVals()) {
+  for (auto sizes : tilingConfig.getTileSizes()) {
     for (auto [i, size] : llvm::enumerate(sizes)) {
       if (size == 1) shape[i] = 1;
       if (shape[i] == -1 || size == 0) continue;
@@ -320,6 +318,7 @@
 //===---------------------------------------------------------------------===//
 
 void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
+                                             TilingConfig &tilingConfig,
                                              bool enableVectorMasking,
                                              bool enableAArch64SSVE) {
   addTileAndDistributePasses(passManager);
@@ -328,7 +327,7 @@
   // only.
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(static_cast<int64_t>(TilingLevel::ParallelTiles)));
+      createLLVMCPUTilePass(tilingConfig.getVectorParallelLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUPeelPass());
   {
     LLVMCPUVectorizationPassOptions options;
@@ -358,16 +357,18 @@
 }
 
 void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
+                                          TilingConfig &tilingConfig,
                                           bool enableVectorMasking) {
   addTileAndDistributePasses(passManager);
 
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
-  nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(
-      static_cast<int64_t>(TilingLevel::ParallelTiles)));
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createLLVMCPUTileAndFusePass(tilingConfig.getVectorParallelLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(
       createLLVMCPUTensorPadPass(LLVMCPUTensorPadOption::ParallelDims));
+
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(static_cast<int64_t>(TilingLevel::ReductionTiles)));
+      createLLVMCPUTilePass(tilingConfig.getVectorReductionLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(
       createLLVMCPUTensorPadPass(LLVMCPUTensorPadOption::ReductionDims));
 
@@ -428,11 +429,9 @@
   }
 }
 
-void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
-                                      int64_t numLevels, bool enablePeeling,
-                                      bool enableVectorMasking,
-                                      bool lowerToAVX2,
-                                      bool enableAArch64SSVE) {
+void addMultiTilingExpertPassPipeline(
+    OpPassManager &passManager, TilingConfig &tilingConfig, bool enablePeeling,
+    bool enableVectorMasking, bool lowerToAVX2, bool enableAArch64SSVE) {
   addTileAndDistributePasses(passManager);
 
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -443,19 +442,28 @@
   nestedModulePM.addNestedPass<func::FuncOp>(
       createRematerializeParallelOpsPass());
 
-  for (int64_t i = 1; i < numLevels - 1; ++i) {
-    nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(i));
-    nestedModulePM.addNestedPass<func::FuncOp>(
-        createFuseTensorPadWithConsumerPass());
-    nestedModulePM.addNestedPass<func::FuncOp>(
-        createConcretizePadResultShapePass());
+  SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
+  // Apply tile and fuse to all the non-distribution fusable levels. Skip the
+  // distribution level, as it has already been fused.
+  if (allFusableLevels.size() > 1) {
+    ArrayRef<int64_t> nonDistFusableLevels(allFusableLevels.begin() + 1,
+                                           allFusableLevels.end());
+    for (int64_t level : nonDistFusableLevels) {
+      nestedModulePM.addNestedPass<func::FuncOp>(
+          createLLVMCPUTileAndFusePass(level));
+      nestedModulePM.addNestedPass<func::FuncOp>(
+          createFuseTensorPadWithConsumerPass());
+      nestedModulePM.addNestedPass<func::FuncOp>(
+          createConcretizePadResultShapePass());
+    }
   }
+
   // Run SplitReductionPass before the final reduction Fuse pass, because
   // SplitReductionPass takes care of banked-tiling.
   nestedModulePM.addNestedPass<func::FuncOp>(
       createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(numLevels - 1));
+      createLLVMCPUTilePass(tilingConfig.getVectorReductionLevel()));
 
   nestedModulePM.addNestedPass<func::FuncOp>(
       createFuseTensorPadWithConsumerPass());
@@ -504,6 +512,7 @@
 }
 
 void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
+                                               TilingConfig &tilingConfig,
                                                bool enableVectorMasking,
                                                bool enableAArch64SSVE) {
   addTileAndDistributePasses(passManager);
@@ -514,15 +523,15 @@
   // get tiled if the case is conv + generic op. In this case, we have to tile
   // along reduction dim again, which needs them to be Linalg ops form.
 
-  nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(
-      static_cast<int64_t>(TilingLevel::ParallelTiles)));
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createLLVMCPUTileAndFusePass(tilingConfig.getVectorParallelLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(
       createFuseTensorPadWithConsumerPass());
   nestedModulePM.addNestedPass<func::FuncOp>(
       createConcretizePadResultShapePass());
 
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(static_cast<int64_t>(TilingLevel::ReductionTiles)));
+      createLLVMCPUTilePass(tilingConfig.getVectorReductionLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(
       createDecomposeConvolutionToLowerDimOpsPass());
 
@@ -567,6 +576,7 @@
 }
 
 void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+                                      TilingConfig &tilingConfig,
                                       bool enableMicrokernels) {
   addTileAndDistributePasses(passManager);
 
@@ -576,9 +586,9 @@
     nestedModulePM.addPass(createLLVMCPULowerToUKernelsPass());
   } else {
     nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(
-        static_cast<int64_t>(TilingLevel::ParallelTiles)));
+        static_cast<int64_t>(tilingConfig.getVectorParallelLevel())));
     nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(
-        static_cast<int64_t>(TilingLevel::ReductionTiles)));
+        static_cast<int64_t>(tilingConfig.getVectorReductionLevel())));
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLLVMCPUVectorizationPass());
   }
@@ -594,11 +604,12 @@
   }
 }
 
-void addCPUDataTilingPipeline(OpPassManager &passManager) {
+void addCPUDataTilingPipeline(OpPassManager &passManager,
+                              TilingConfig &tilingConfig) {
   addTileAndDistributePasses(passManager);
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
   nestedModulePM.addNestedPass<func::FuncOp>(
-      createLLVMCPUTilePass(static_cast<int64_t>(TilingLevel::ParallelTiles)));
+      createLLVMCPUTilePass(tilingConfig.getVectorParallelLevel()));
   nestedModulePM.addNestedPass<func::FuncOp>(
       createDecomposePackUnPackOpsPass());
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.cpp
new file mode 100644
index 0000000..5f597fd
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.cpp
@@ -0,0 +1,96 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h"
+
+using mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr;
+
+namespace mlir {
+namespace iree_compiler {
+
+TilingConfig::TilingConfig(IREE::Codegen::LoweringConfigAttr lc)
+    : loweringConfig(lc) {
+  assert(lc && "Expected a valid lowering config");
+
+  // Initialize indices to invalid.
+  std::fill(tilingLevelToActualLevelMap.begin(),
+            tilingLevelToActualLevelMap.end(), InvalidLevel);
+
+  // Map the tiling levels that are defined in the actual configuration to
+  // their corresponding incremental levels. We currently support the following
+  // scenarios:
+  //   1. [[distribution]]
+  //   2. [[distribution], [vector-parallel]]
+  //   3. [[distribution], [vector-parallel], [vector-reduction]]
+  //   4. [[distribution], [cache-parallel], [cache-reduction],
+  //       [vector-parallel], [vector-reduction]]
+  int numTileLevels = loweringConfig.getTileSizes().size();
+  switch (numTileLevels) {
+    case 3:
+      tilingLevelToActualLevelMap[VectorReductionTiles] = 2;
+      [[fallthrough]];
+    case 2:
+      tilingLevelToActualLevelMap[VectorParallelTiles] = 1;
+      [[fallthrough]];
+    case 1:
+      tilingLevelToActualLevelMap[DistributionTiles] = 0;
+      break;
+    case MaxNumTileLevels:
+      for (int i = 0; i < MaxNumTileLevels; ++i) {
+        tilingLevelToActualLevelMap[i] = i;
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+/// Returns the tile sizes of all the vector dimensions, including parallel
+/// and reduction dimensions.
+SmallVector<int64_t> TilingConfig::getVectorTileSizes() {
+  unsigned numDims = getNumDimensions();
+  SmallVector<int64_t> vectorSizes(numDims);
+  SmallVector<int64_t> parallelSizes = getVectorParallelSizes();
+  SmallVector<int64_t> reductionSizes = getVectorReductionSizes();
+  for (int i = 0; i < numDims; ++i) {
+    vectorSizes[i] =
+        parallelSizes[i] != 0 ? parallelSizes[i] : reductionSizes[i];
+  }
+
+  return vectorSizes;
+}
+
+/// Returns a list with the tiling levels that can be fused for this
+/// configuration.
+SmallVector<int64_t> TilingConfig::getFusableLevels() {
+  switch (getNumTilingLevels()) {
+    case 0:
+      return {};
+    case 1:
+      // Only distribution level.
+      return {0};
+    case 3:
+      // Distribution + vector parallel levels.
+      return {0, 1};
+    case 5:
+      // Distribution + cache parallel levels.
+      return {0, 1};
+    default:
+      llvm_unreachable("Unexpected number of tiling levels");
+  }
+}
+
+/// Returns the actual level in the configuration for this level of tiling.
+unsigned TilingConfig::getActualLevel(TilingLevel level) {
+  assert(level < InvalidLevel && "Unexpected invalid tiling level");
+  unsigned actualLevel = tilingLevelToActualLevelMap[level];
+  assert(actualLevel != InvalidLevel &&
+         "Searching for unavailable tiling level");
+  return actualLevel;
+}
+
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h
new file mode 100644
index 0000000..90c36b5
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/TileSizeSelection.h
@@ -0,0 +1,129 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_CODEGEN_LLVMCPU_TILESIZESELECTION_H_
+#define IREE_COMPILER_CODEGEN_LLVMCPU_TILESIZESELECTION_H_
+
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
+#include "mlir/IR/BuiltinOps.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+/// Provides a unified API to access all the tile sizes needed during the
+/// CPU lowering process, while abstracting the representation and
+/// verification details of such information in the IR.
+///
+/// We currently support the following scenarios:
+///   1. [[distribution]]
+///   2. [[distribution], [vector-parallel]]
+///   3. [[distribution], [vector-parallel], [vector-reduction]]
+///   4. [[distribution], [cache-parallel], [cache-reduction],
+///       [vector-parallel], [vector-reduction]]
+class TilingConfig {
+ public:
+  TilingConfig(IREE::Codegen::LoweringConfigAttr lc);
+
+  /// Returns the number of tiling levels of the configuration.
+  unsigned getNumTilingLevels() {
+    return loweringConfig.getTileSizes().size();
+  }
+
+  /// Returns the number of dimensions of the configuration. All the tiling
+  /// levels must have the same number of dimensions.
+  unsigned getNumDimensions() { return getDistributionTileSizes().size(); }
+
+  /// Returns the number of parallel dimensions to tile at vector level.
+  unsigned getNumVectorParallelTiles() {
+    return llvm::count_if(getVectorParallelSizes(),
+                          [](int64_t tileSize) { return tileSize != 0; });
+  }
+
+  /// Returns the tiling level for cache parallel dimensions.
+  unsigned getCacheParallelLevel() {
+    return getActualLevel(CacheParallelTiles);
+  }
+
+  /// Returns the tiling level for cache reduction dimensions.
+  unsigned getCacheReductionLevel() {
+    return getActualLevel(CacheReductionTiles);
+  }
+
+  /// Returns the tiling level for vector parallel dimensions.
+  unsigned getVectorParallelLevel() {
+    return getActualLevel(VectorParallelTiles);
+  }
+
+  /// Returns the tiling level for vector reduction dimensions.
+  unsigned getVectorReductionLevel() {
+    return getActualLevel(VectorReductionTiles);
+  }
+
+  /// Returns all the tile sizes of all the levels of the configuration.
+  TileSizesListType getTileSizes() { return loweringConfig.getTileSizeVals(); }
+
+  /// Returns the distribution tile sizes of the configuration.
+  SmallVector<int64_t> getDistributionTileSizes() {
+    return loweringConfig.getTileSizeVals(getActualLevel(DistributionTiles));
+  }
+
+  SmallVector<int64_t> getCacheReductionSizes() {
+    return loweringConfig.getTileSizeVals(getCacheReductionLevel());
+  }
+
+  SmallVector<int64_t> getVectorParallelSizes() {
+    return loweringConfig.getTileSizeVals(getVectorParallelLevel());
+  }
+
+  SmallVector<int64_t> getVectorReductionSizes() {
+    return loweringConfig.getTileSizeVals(getVectorReductionLevel());
+  }
+
+  /// Returns the tile sizes of all the vector dimensions, including parallel
+  /// and reduction dimensions.
+  SmallVector<int64_t> getVectorTileSizes();
+
+  /// Returns a list with the tiling levels that can be fused for this
+  /// configuration.
+  SmallVector<int64_t> getFusableLevels();
+
+  // TODO(dcaballe): Revisit if these features are ever used.
+  ArrayAttr getTileInterchange() { return loweringConfig.getTileInterchange(); }
+  SmallVector<int64_t> getTileInterchangeSizes(unsigned level) {
+    return loweringConfig.getTileInterchangeVals(level);
+  }
+  SmallVector<int64_t> getNativeVectorSizes() {
+    return loweringConfig.getNativeVectorSizeVals();
+  }
+
+ private:
+  /// Internal representation for all the supported tiling levels. All or just
+  /// a subset of them may be available in a valid configuration.
+  enum TilingLevel : unsigned {
+    DistributionTiles = 0,
+    CacheParallelTiles = 1,
+    CacheReductionTiles = 2,
+    VectorParallelTiles = 3,
+    VectorReductionTiles = 4,
+    MaxNumTileLevels = 5,
+    InvalidLevel = 6,
+  };
+
+  /// Returns the actual level in the configuration for this level of tiling.
+  unsigned getActualLevel(TilingLevel level);
+
+  /// Holds the lowering config that provides the configuration.
+  IREE::Codegen::LoweringConfigAttr loweringConfig;
+
+  /// Maps a `TilingLevel` to its actual level index in this configuration.
+  std::array<unsigned, TilingLevel::MaxNumTileLevels>
+      tilingLevelToActualLevelMap;
+};
+
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_CODEGEN_LLVMCPU_TILESIZESELECTION_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
index 0418ac9..2f48853 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -18,7 +18,7 @@
         %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
         %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
         %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-        // expected-error @+1 {{expected three tiling sizes, got 0}}
+        // expected-error @+1 {{expected three tiling levels, got 0}}
         linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
           outs(%result: memref<4x16xf32>)
         return
@@ -76,7 +76,7 @@
         %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
         %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
         %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-        // expected-error @+1 {{expected only parallel dims to be set in the second tiling sizes, got 2-th tile size set}}
+        // expected-error @+1 {{expected only parallel dims to be set in the second tiling level, got 2-th tile size set}}
         linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
           outs(%result: memref<4x16xf32>)
         return
@@ -105,7 +105,7 @@
         %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
         %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
         %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-        // expected-error @+1 {{only reduction dims to be set in the third tiling sizes, got 1-th tile size set}}
+        // expected-error @+1 {{only reduction dims to be set in the third tiling level, got 1-th tile size set}}
         linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
           outs(%result: memref<4x16xf32>)
         return
@@ -165,7 +165,7 @@
         %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
         %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
         %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-        // expected-error @+1 {{expected only parallel dims to be set in the second tiling sizes, got 2-th tile size set}}
+        // expected-error @+1 {{expected only parallel dims to be set in the second tiling level, got 2-th tile size set}}
         linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
           outs(%result: memref<4x16xf32>)
         return
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index a95fa96..566cea6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -628,16 +628,22 @@
 hal.executable private @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize {
   hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
     hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+      lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0]]>,
       translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
     } {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
       hal.return %arg1, %arg2, %arg2 : index, index, index
     }
     builtin.module {
-      func.func @dispatch() { 
+      func.func @dispatch() {
+        %c0 = arith.constant 0 : index
+        %c1 = arith.constant 1 : index
         %cst_0 = arith.constant 0.000000e+00 : f32
-        %0 = tensor.empty() : tensor<1xf32>
-        %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1xf32>) -> tensor<1xf32>
+        %0 = hal.interface.constant.load[0] : i32
+        %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+        %7 = tensor.empty() : tensor<1xf32>
+        %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+        flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
         return
       }
     }
@@ -650,16 +656,22 @@
 hal.executable private @aarch64_ssve__cpu_double_tiling_peeling_expert {
   hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
     hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+      lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0]]>,
       translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
     } {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
       hal.return %arg1, %arg2, %arg2 : index, index, index
     }
     builtin.module {
-      func.func @dispatch() { 
+      func.func @dispatch() {
+        %c0 = arith.constant 0 : index
+        %c1 = arith.constant 1 : index
         %cst_0 = arith.constant 0.000000e+00 : f32
-        %0 = tensor.empty() : tensor<1xf32>
-        %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1xf32>) -> tensor<1xf32>
+        %0 = hal.interface.constant.load[0] : i32
+        %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+        %7 = tensor.empty() : tensor<1xf32>
+        %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+        flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
         return
       }
     }
@@ -672,6 +684,7 @@
 hal.executable private @aarch64_ssve__cpu_conv_tile_and_decompose_expert {
   hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
     hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+      lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0]]>,
       translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
     } {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
@@ -679,9 +692,14 @@
     }
     builtin.module {
       func.func @dispatch() {
+        %c0 = arith.constant 0 : index
+        %c1 = arith.constant 1 : index
         %cst_0 = arith.constant 0.000000e+00 : f32
-        %0 = tensor.empty() : tensor<1xf32>
-        %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1xf32>) -> tensor<1xf32>
+        %0 = hal.interface.constant.load[0] : i32
+        %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+        %7 = tensor.empty() : tensor<1xf32>
+        %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+        flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
         return
       }
     }
@@ -704,6 +722,7 @@
 hal.executable private @aarch64_ssve_sve_disabled {
   hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_no_sve {
     hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
+      lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0]]>,
       translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
     } {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
@@ -711,9 +730,14 @@
     }
     builtin.module {
       func.func @dispatch() {
+        %c0 = arith.constant 0 : index
+        %c1 = arith.constant 1 : index
         %cst_0 = arith.constant 0.000000e+00 : f32
-        %0 = tensor.empty() : tensor<1xf32>
-        %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<1xf32>) -> tensor<1xf32>
+        %0 = hal.interface.constant.load[0] : i32
+        %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+        %7 = tensor.empty() : tensor<1xf32>
+        %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+        flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
         return
       }
     }
diff --git a/compiler/src/iree/compiler/Codegen/Passes.cpp b/compiler/src/iree/compiler/Codegen/Passes.cpp
index ac725e1..2656a06 100644
--- a/compiler/src/iree/compiler/Codegen/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/Passes.cpp
@@ -75,25 +75,6 @@
       });
 }
 
-/// Hook to verify the lowering configuration and translation info for an
-/// operation.
-LogicalResult verifyLoweringConfiguration(
-    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
-    IREE::Codegen::TranslationInfoAttr translationInfo,
-    ArrayRef<int64_t> workgroupSize) {
-  switch (translationInfo.getDispatchLoweringPassPipeline()) {
-    case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert:
-      return verifyDoubleTilingExpertPassPipelineConfig(op, loweringConfig,
-                                                        translationInfo);
-    case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt:
-      return verifyGPUMatmulPipeline(op, loweringConfig, translationInfo,
-                                     workgroupSize);
-    default:
-      break;
-  }
-  return success();
-}
-
 void addCommonTargetExecutablePreprocessingPasses(OpPassManager &passManager) {
   passManager.addNestedPass<func::FuncOp>(createTypePropagationPass());
   passManager.addPass(createBubbleUpOrdinalOpsPass());