Merge google -> main (#7667)

* bb1e2a751 Synchronize submodules with LLVM at llvm/llvm-project@8909dc5ebe8a
* 206b858d2 Run bazel_to_cmake on iree/compiler/Codegen/Common/.
* e3e22e769 Merge pull request #7664 from GMNGeoffrey:main-to-google
* d4725075d Integrate LLVM at llvm/llvm-project@8909dc5ebe8a
* 1c01a25cf Integrate LLVM at llvm/llvm-project@4602f52d482c
* cae7b77d2 Migrate android.support.annotation.NonNull to androidx.
* a4cd45079 Integrate LLVM at llvm/llvm-project@f46f93b47863
* 9685f4c90 Integrate LLVM at llvm/llvm-project@1d7fdbbc183a
* 5446edced Integrate LLVM at llvm/llvm-project@42102bce98e5
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/BUILD b/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
index 67079cd..bfff2f9 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
@@ -56,6 +56,7 @@
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:ReconcileUnrealizedCasts",
         "@llvm-project//mlir:Shape",
         "@llvm-project//mlir:ShapeTransforms",
         "@llvm-project//mlir:StandardOps",
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
index 67930ce..d5f8ddf 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
@@ -10,6 +10,9 @@
 #include "iree/compiler/Utils/ConversionUtils.h"
 #include "iree_tf_compiler/TFL/PassDetail.h"
 #include "iree_tf_compiler/TFL/Passes.h"
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -24,7 +27,7 @@
     : public LowerGlobalTensorsBase<LowerGlobalTensorsPass> {
  public:
   void getDependentDialects(DialectRegistry& registry) const override {
-    registry.insert<mlir::TFL::TensorFlowLiteDialect,
+    registry.insert<mlir::TFL::TensorFlowLiteDialect, tosa::TosaDialect,
                     iree_compiler::IREE::Util::UtilDialect>();
   }
 
@@ -49,14 +52,29 @@
       // Look through the initialization functions and find the assigned values
       // for each handle, save out the constant value.
       for (auto init : func.getOps<mlir::TFL::CallOnceOp>()) {
-        FuncOp initFunc = symNameToFunction[init.session_init_function()];
+        auto findInitFunc =
+            symNameToFunction.find(init.session_init_function());
+        if (findInitFunc == symNameToFunction.end()) {
+          init.emitError("Unable to find initialization function: " +
+                         init.session_init_function());
+          continue;
+        }
+        FuncOp initFunc = std::get<1>(*findInitFunc);
         for (auto assign : initFunc.getOps<mlir::TFL::AssignVariableOp>()) {
           auto handle = dyn_cast<mlir::TFL::VarHandleOp>(
               assign.resource_id().getDefiningOp());
           if (!handle) continue;
 
           DenseElementsAttr constant;
-          if (!matchPattern(assign.value(), m_Constant(&constant))) continue;
+          if (!matchPattern(assign.value(), m_Constant(&constant))) {
+            // For quantized types we cannot use the m_Constant matcher.
+            if (auto constOp = dyn_cast<mlir::TFL::QConstOp>(
+                    assign.value().getDefiningOp())) {
+              constant = constOp.value().cast<DenseElementsAttr>();
+            }
+          }
+          if (!constant) continue;
+
           auto name = handle.shared_name();
           sharedNameToConstant[name] = constant;
           sharedNameToLoc[name] = handle.getLoc();
@@ -131,8 +149,19 @@
       if (!address) continue;
 
       builder.setInsertionPoint(assign);
+      Value value = assign.value();
+      Type storageType = address.getType()
+                             .cast<iree_compiler::IREE::Util::PtrType>()
+                             .getTargetType();
+      if (storageType != value.getType()) {
+        value = builder
+                    .create<UnrealizedConversionCastOp>(assign.getLoc(),
+                                                        storageType, value)
+                    .getResult(0);
+      }
+
       builder.create<iree_compiler::IREE::Util::GlobalStoreIndirectOp>(
-          assign.getLoc(), assign.value(), assign.resource_id());
+          assign.getLoc(), value, assign.resource_id());
       assign.erase();
     }
 
@@ -149,9 +178,15 @@
       auto type = ptrType.getTargetType();
 
       builder.setInsertionPoint(read);
-      auto load =
+      Value load =
           builder.create<iree_compiler::IREE::Util::GlobalLoadIndirectOp>(
               read.getLoc(), type, read.resource_id());
+      if (type != read.getResult().getType()) {
+        load = builder
+                   .create<UnrealizedConversionCastOp>(
+                       read.getLoc(), read.getResult().getType(), load)
+                   .getResult(0);
+      }
       read.getResult().replaceAllUsesWith(load);
       read.erase();
     }
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
index ef90c13..3d18060 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
@@ -7,6 +7,7 @@
 #include "iree_tf_compiler/TFL/Passes.h"
 
 #include "iree/compiler/Dialect/Shape/Transforms/Passes.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Dialect/Shape/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -56,6 +57,7 @@
   mlir::tosa::createTFTFLtoTOSALegalizationPipeline(pm, tosaOptions);
   pm.nest<FuncOp>().addPass(mlir::tosa::createStripQuantTypesPass());
   pm.addPass(createCanonicalizerPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 
   //----------------------------------------------------------------------------
   // Lowering shape-related constructs
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
index 3a6a782..1e46707 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
@@ -28,6 +28,7 @@
     ConversionTarget target(getContext());
     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
     target.addIllegalDialect<mlir::TFL::TensorFlowLiteDialect>();
+    target.addIllegalOp<mlir::UnrealizedConversionCastOp>();
     if (failed(
             iree_compiler::verifyAllOperationsAreLegal(getOperation(), target)))
       return signalPassFailure();
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir b/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
index e9d8668..15a4b94 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
@@ -1,9 +1,8 @@
 // RUN: iree-opt-tflite -split-input-file -allow-unregistered-dialect -pass-pipeline='iree-tflite-lower-global-tensors' %s | IreeFileCheck %s
 
-// CHECK-LABEL: module {
 module {
   // CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
-  // CHECK: func @state
+  // CHECK-LABEL: func @state
   func @state(%arg0: tensor<16x16xf32>) -> () {
     "tfl.call_once"() {session_init_function = "StateInit"} : () -> ()
     return
@@ -19,11 +18,10 @@
 
 // -----
 
-// CHECK-LABEL: module {
 module {
   // CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
 
-  // CHECK: func @assign
+  // CHECK-LABEL: func @assign
   func @assign(%arg0: tensor<16x16xf32>) -> () {
     "tfl.call_once"() {session_init_function = "AssignInit"} : () -> ()
     // CHECK: %[[ADDR:.+]] = util.global.address @__iree_flow_Variable : !util.ptr<tensor<16x16xf32>>
@@ -44,11 +42,10 @@
 
 // -----
 
-// CHECK-LABEL: module {
 module {
   // CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
 
-  // CHECK: func @read
+  // CHECK-LABEL: func @read
   func @read(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
     "tfl.call_once"() {session_init_function = "ReadInit"} : () -> ()
 
@@ -70,11 +67,10 @@
 
 // -----
 
-// CHECK-LABEL: module {
 module {
   // CHECK: util.global private mutable @__iree_flow_Variable = dense<2.000000e+00> : tensor<16x16xf32>
 
-  // func @readAssign
+  // CHECK-LABEL: func @readAssign
   func @readAssign(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
     "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> ()
     // CHECK: %[[ADDR:.+]] = util.global.address @__iree_flow_Variable : !util.ptr<tensor<16x16xf32>>
@@ -101,6 +97,35 @@
 // -----
 
 module {
+  // CHECK: util.global private mutable @__iree_flow_Variable = dense<42> : tensor<2x3xi8>
+  // CHECK-LABEL: func @readAssignQuant
+  func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> (tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) {
+    "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> ()
+    %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource>
+
+    // CHECK: %[[ADDR:.+]] = util.global.load.indirect %ptr___iree_flow_Variable : !util.ptr<tensor<2x3xi8>> -> tensor<2x3xi8>
+    // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ADDR]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform<i8:f32, 1.000000e-01:2>>
+    %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+
+    // CHECK: %[[ADD:.+]] = tfl.add %[[CAST]], %arg0 {fused_activation_function = "NONE"}
+    %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+
+    // CHECK: %[[CAST2:.+]] = builtin.unrealized_conversion_cast %[[ADD]] : tensor<2x3x!quant.uniform<i8:f32, 1.000000e-01:2>> to tensor<2x3xi8>
+    // CHECK: util.global.store.indirect %[[CAST2]], %ptr___iree_flow_Variable
+    "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> ()
+    return %2 : tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+  }
+  func private @ReadAssignInit() {
+    %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource>
+    %1 = "tfl.pseudo_const"() {qtype = tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>, value = dense<42> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+    "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> ()
+    return
+  }
+}
+
+// -----
+
+module {
   // CHECK-label: @nostate
   func @nostate(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
     "tfl.call_once"() {session_init_function = "NoStateInit"} : () -> ()
diff --git a/iree/compiler/Codegen/BUILD b/iree/compiler/Codegen/BUILD
index 8a91e56..175001a 100644
--- a/iree/compiler/Codegen/BUILD
+++ b/iree/compiler/Codegen/BUILD
@@ -34,6 +34,7 @@
     ],
     deps = [
         ":PassesIncGen",
+        "//iree/compiler/Codegen/Dialect:IREECodegenDialect",
         "//iree/compiler/Dialect/HAL/IR",
         "@llvm-project//mlir:LinalgTransforms",
         "@llvm-project//mlir:Pass",
@@ -50,6 +51,7 @@
         ":PassHeaders",
         ":PassesIncGen",
         "//iree/compiler/Codegen/Common",
+        "//iree/compiler/Codegen/Dialect:IREECodegenDialect",
         "//iree/compiler/Codegen/LLVMCPU",
         "//iree/compiler/Codegen/LLVMGPU",
         "//iree/compiler/Codegen/SPIRV",
diff --git a/iree/compiler/Codegen/CMakeLists.txt b/iree/compiler/Codegen/CMakeLists.txt
index dc25c35..0fcd93d 100644
--- a/iree/compiler/Codegen/CMakeLists.txt
+++ b/iree/compiler/Codegen/CMakeLists.txt
@@ -31,6 +31,7 @@
     MLIRLinalgTransforms
     MLIRPass
     MLIRTransforms
+    iree::compiler::Codegen::Dialect::IREECodegenDialect
     iree::compiler::Dialect::HAL::IR
   PUBLIC
 )
@@ -44,6 +45,7 @@
     ::PassHeaders
     ::PassesIncGen
     iree::compiler::Codegen::Common
+    iree::compiler::Codegen::Dialect::IREECodegenDialect
     iree::compiler::Codegen::LLVMCPU
     iree::compiler::Codegen::LLVMGPU
     iree::compiler::Codegen::SPIRV
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 62124cf..95251a1 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -12,8 +12,6 @@
 // List of pre-existing pipelines for translating executables.
 def CPU_Default
     : StrEnumAttrCase<"CPUDefault">;
-def CPU_Vectorization
-    : StrEnumAttrCase<"CPUVectorization">;
 def CPU_TensorToVectors
     : StrEnumAttrCase<"CPUTensorToVectors">;
 def CPU_TileFuseAndVectorize
@@ -42,8 +40,8 @@
     "DispatchLoweringPassPipeline",
     "identifier for pass pipeline use to lower dispatch region",
     [CPU_Default, CPU_TensorToVectors, CPU_TileFuseAndVectorize,
-     CPU_Vectorization, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
-     LLVMGPU_MatmulSimt, SPIRV_SimpleDistribute, SPIRV_Vectorize,
+     LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+     SPIRV_SimpleDistribute, SPIRV_Vectorize,
      SPIRV_VectorizeToCooperativeOps, None]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
 }
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index 368243a..97c33b6 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -20,7 +20,6 @@
         "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUUnfuseFMAOps.cpp",
-        "LLVMCPUVectorization.cpp",
         "Passes.cpp",
         "VectorContractToAArch64InlineAsmOp.cpp",
     ],
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4e7ff7a..d2061de 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -23,7 +23,6 @@
     "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUUnfuseFMAOps.cpp"
-    "LLVMCPUVectorization.cpp"
     "Passes.cpp"
     "VectorContractToAArch64InlineAsmOp.cpp"
   DEPS
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index f6bef55..9f794ac 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -304,6 +304,9 @@
         lb.getInt(), ub.getInt(),
         workloadPerWorkgroup[tiledLoops.size() - 1 - i], vectorSizeVals[i]);
   }
+  if (isBatchMatmul) {
+    workloadPerWorkgroup.push_back(1);
+  }
   setTranslationInfo(
       entryPointFn,
       clUseTileFuseAndVectorize
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 0c62f13..72078cd 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -92,6 +92,21 @@
   return input;
 }
 
+/// Verify that valid configuration is set for all ops within the compiled
+/// module.
+template <typename F>
+static LogicalResult verifyLoweringConfiguration(
+    ModuleOp module, IREE::Codegen::TranslationInfoAttr translationInfo,
+    F verificationFn) {
+  auto walkResult = module.walk([&](Operation *op) -> WalkResult {
+    IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
+    if (!loweringConfig) return WalkResult::advance();
+    return verificationFn(op, loweringConfig, translationInfo,
+                          ArrayRef<int64_t>{});
+  });
+  return failure(walkResult.wasInterrupted());
+}
+
 void LLVMCPULowerExecutableTargetPass::runOnOperation() {
   IREE::HAL::ExecutableVariantOp variantOp = getOperation();
   ModuleOp moduleOp = variantOp.getInnerModule();
@@ -118,55 +133,67 @@
     }
 
     // There might be multiple entry points in the module. Currently, all of
-    // them need to have the same pipeline.
+    // them need to have the same translation info.
     // TODO(ravishankarm): This is strange that this is not enforced
-    // structurally, but something to address later on. For now this restriction
+    // structurally, but something to address later on. The main issue is how
+    // to invoke separate dynamic pass pipelines on entry point functions, when
+    // the passes might have module level changes. For now this restriction
     // is fine.
     llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPoints =
         getAllEntryPoints(moduleOp);
-    Optional<IREE::Codegen::DispatchLoweringPassPipeline> passPipeline;
+    Optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
     for (auto &it : entryPoints) {
       auto entryPointOp = it.second;
-      if (IREE::Codegen::TranslationInfoAttr translationInfo =
+      if (IREE::Codegen::TranslationInfoAttr currTranslationInfo =
               getTranslationInfo(entryPointOp)) {
-        IREE::Codegen::DispatchLoweringPassPipeline currPipeline =
-            translationInfo.getDispatchLoweringPassPipeline();
-        if (passPipeline) {
-          if (currPipeline != passPipeline.getValue()) {
-            moduleOp.emitError(
-                "unhandled compilation of entry point function with different "
-                "pass pipelines within a module");
-            return signalPassFailure();
+        if (translationInfo) {
+          if (currTranslationInfo != translationInfo.getValue()) {
+            moduleOp.emitOpError(
+                "unhandled compilation of entry point functions with different "
+                "translation info");
           }
-          continue;
+        } else {
+          translationInfo = currTranslationInfo;
         }
-        passPipeline = currPipeline;
       }
     }
 
-    executableLoweringPipeline.addPass(createSetNumWorkgroupsPass());
-    executableLoweringPipeline.addPass(createCanonicalizerPass());
-    if (!testLoweringConfiguration && passPipeline.hasValue()) {
-      OpPassManager &nestedModulePM =
-          executableLoweringPipeline.nest<ModuleOp>();
-      switch (passPipeline.getValue()) {
-        case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
-        case IREE::Codegen::DispatchLoweringPassPipeline::None:
-          addCPUDefaultPassPipeline(nestedModulePM);
-          break;
-        case IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization:
-          addCPUVectorizationPassPipeline(nestedModulePM, lowerToVectors);
-          break;
+    // Verify the configuration.
+    if (translationInfo.hasValue()) {
+      LogicalResult verificationStatus = success();
+      switch (translationInfo.getValue().getDispatchLoweringPassPipeline()) {
         case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
-          addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
+          verificationStatus = verifyLoweringConfiguration(
+              moduleOp, translationInfo.getValue(),
+              verifyTensorToVectorsPassPipelineConfig);
           break;
-        case IREE::Codegen::DispatchLoweringPassPipeline::
-            CPUTileFuseAndVectorize:
-          addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
-                                         /*useTileAndVectorizeV2=*/true);
-          break;
-        default:
-          llvm_unreachable("Unsupported pipeline on CPU target.");
+        default:;
+      }
+      if (failed(verificationStatus)) {
+        return signalPassFailure();
+      }
+
+      executableLoweringPipeline.addPass(createSetNumWorkgroupsPass());
+      executableLoweringPipeline.addPass(createCanonicalizerPass());
+      if (!testLoweringConfiguration) {
+        OpPassManager &nestedModulePM =
+            executableLoweringPipeline.nest<ModuleOp>();
+        switch (translationInfo.getValue().getDispatchLoweringPassPipeline()) {
+          case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
+          case IREE::Codegen::DispatchLoweringPassPipeline::None:
+            addCPUDefaultPassPipeline(nestedModulePM);
+            break;
+          case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
+            addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
+            break;
+          case IREE::Codegen::DispatchLoweringPassPipeline::
+              CPUTileFuseAndVectorize:
+            addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
+                                           /*useTileAndVectorizeV2=*/true);
+            break;
+          default:
+            llvm_unreachable("Unsupported pipeline on CPU target.");
+        }
       }
     }
   }
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
deleted file mode 100644
index d27153e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright 2020 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
-#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-linalg-to-llvm-tile-and-vectorize"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-// Could just be linalg::TilingPattern with a ContractionOpInterface filter, but
-// that is always templated on an op.
-struct TileWorkgroups : public linalg::LinalgBaseTilingPattern {
-  using Base = linalg::LinalgBaseTilingPattern;
-  TileWorkgroups(MLIRContext *context, linalg::LinalgTilingOptions options,
-                 linalg::LinalgTransformationFilter marker)
-      : LinalgBaseTilingPattern(context, options, marker) {}
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const override {
-    auto contractionOp = dyn_cast<linalg::ContractionOpInterface>(op);
-    if (!contractionOp) return failure();
-
-    linalg::TiledLinalgOp tiledLinalgOp;
-    if (failed(Base::matchAndRewriteBase(op, rewriter, tiledLinalgOp)) ||
-        !tiledLinalgOp.tensorResults.empty()) {
-      return failure();
-    }
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-}  // namespace
-
-namespace {
-struct LLVMCPUVectorizationPass
-    : public LLVMCPUVectorizationBase<LLVMCPUVectorizationPass> {
-  LLVMCPUVectorizationPass(bool vectorize = true) : lowerToVectors(vectorize) {}
-  LLVMCPUVectorizationPass(const LLVMCPUVectorizationPass &pass) {
-    lowerToVectors = pass.lowerToVectors;
-  }
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, AffineDialect, scf::SCFDialect,
-                    vector::VectorDialect>();
-  }
-  void runOnOperation() override;
-
- private:
-  /// TODO(ravishankarm): Option to not generate any `vector.` instructions. The
-  /// VMVX backend uses the same lowering as the CPU pass but there is no
-  /// lowering of these `vector.` operations to scalar code. So as a WAR do the
-  /// same tiling scheme but avoid generating vector instructions. When VMVX can
-  /// handle vector instructions, drop this options.
-  bool lowerToVectors;
-
-  Option<bool> enableVectorContractToAarch64Asm{
-      *this, "vector-contract-to-aarch64-asm",
-      llvm::cl::desc("Enable promoting wokgroup memory to full tiles allocated "
-                     "on the stack."),
-      llvm::cl::init(false)};
-};
-}  // namespace
-
-void LLVMCPUVectorizationPass::runOnOperation() {
-  auto funcOp = getOperation();
-  MLIRContext *context = &getContext();
-
-  // Workgroup first level of tiling.
-  {
-    // First level of tiling patterns. (workgroups memory)
-    RewritePatternSet l1patterns(context);
-    l1patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(builder, op,
-                                  static_cast<unsigned>(TilingLevel::L1Tiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            ArrayRef<Identifier>{},
-            Identifier::get(getWorkgroupL1TileMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l1patterns));
-  }
-
-  // Second level of tiling. (workgroups memory -> vectors)
-  {
-    RewritePatternSet l2patterns(context);
-    l2patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(
-                  builder, op, static_cast<unsigned>(TilingLevel::VectorTiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getWorkgroupL1TileMarker(), context),
-            Identifier::get(getVectorizeMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l2patterns));
-  }
-
-  // Apply canonicalization.
-  {
-    RewritePatternSet canonicalizationPatterns =
-        linalg::getLinalgTilingCanonicalizationPatterns(context);
-    populateAffineMinCanonicalizationPattern(canonicalizationPatterns);
-    if (failed(applyPatternsAndFoldGreedily(
-            funcOp, std::move(canonicalizationPatterns)))) {
-      return signalPassFailure();
-    }
-  }
-
-  if (!lowerToVectors) {
-    return;
-  }
-
-  // Op specific conversion.
-  {
-    RewritePatternSet vectorizeOpsPattenrs(context);
-    populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPattenrs);
-    if (failed(applyPatternsAndFoldGreedily(funcOp,
-                                            std::move(vectorizeOpsPattenrs)))) {
-      return signalPassFailure();
-    }
-  }
-
-  // Apply vectorization patterns.
-  {
-    RewritePatternSet vectorizationPatterns(context);
-    linalg::insertVectorizationPatterns<linalg::ContractionOpInterface,
-                                        linalg::CopyOp, linalg::FillOp>(
-        vectorizationPatterns, linalg::LinalgVectorizationOptions(),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getVectorizeMarker(), context)));
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorizationPatterns);
-    vector::populateVectorReductionToContractPatterns(vectorizationPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorizationPatterns));
-  }
-
-  {
-    // Fold consumer add ops into the contraction op itself.
-    RewritePatternSet canonicalizationPatterns(context);
-    vector::ContractionOp::getCanonicalizationPatterns(canonicalizationPatterns,
-                                                       context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(canonicalizationPatterns));
-  }
-
-  if (enableVectorContractToAarch64Asm) {
-    RewritePatternSet vectorToAArch64AsmPatterns(context);
-    populateVectorContractToAArch64InlineAsm(vectorToAArch64AsmPatterns,
-                                             context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToAArch64AsmPatterns));
-  }
-
-  // Apply vector specific operation lowering.
-  {
-    vector::VectorTransformsOptions vectorTransformsOptions =
-        vector::VectorTransformsOptions().setVectorTransformsOptions(
-            vector::VectorContractLowering::OuterProduct);
-    RewritePatternSet vectorContractLoweringPatterns(context);
-    vectorContractLoweringPatterns.insert<
-        vector::ContractionOpToOuterProductOpLowering,
-        vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
-        vectorTransformsOptions, context);
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorContractLoweringPatterns);
-    (void)applyPatternsAndFoldGreedily(
-        funcOp, std::move(vectorContractLoweringPatterns));
-  }
-
-  // Hosit hierarchical tiling indexing and other loop invariant transfer
-  // ops computation.
-
-  // Programmatic controlled lowering of vector.transfer only.
-  {
-    VectorTransferToSCFOptions vectorToSCFOptions =
-        VectorTransferToSCFOptions().enableFullUnroll();
-    RewritePatternSet vectorToLoopsPatterns(context);
-    populateVectorToSCFConversionPatterns(vectorToLoopsPatterns,
-                                          vectorToSCFOptions);
-    // Hosit hierarchical tiling indexing and other loop invariant transfer
-    // ops computation.
-    linalg::hoistRedundantVectorTransfers(funcOp);
-
-    memref::populateFoldSubViewOpPatterns(vectorToLoopsPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToLoopsPatterns));
-  }
-}
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
-    bool lowerToVectors) {
-  return std::make_unique<LLVMCPUVectorizationPass>(lowerToVectors);
-}
-
-}  // namespace iree_compiler
-}  // namespace mlir
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 33f3999..cb0a934 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Codegen/Passes.h"
 
 #include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
+#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
 #include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Dialect/Shape/Transforms/Passes.h"
 #include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
@@ -26,26 +27,87 @@
   return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
 }
 
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors) {
-  passManager.addPass(createCanonicalizerPass());
+LogicalResult verifyTensorToVectorsPassPipelineConfig(
+    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    IREE::Codegen::TranslationInfoAttr translationInfo,
+    ArrayRef<int64_t> workgroupSize) {
+  if (!workgroupSize.empty()) {
+    return op->emitOpError(
+        "expected workgroup size to be empty for CPU pipelines");
+  }
 
-  // TODO(ataei): This causes segmentation fault on Android. Fix it and
-  // re-enable.
-  // passManager.addNestedPass<FuncOp>(createPadLinalgWorkgroupTilesPass());
+  // Verify that the translation info is using the right pipeline.
+  auto pipeline =
+      IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors;
+  StringRef pipelineName = stringifyEnum(pipeline);
+  if (translationInfo.getDispatchLoweringPassPipeline() != pipeline) {
+    return op->emitOpError("expected pipeline in translation.info to be ")
+           << pipelineName;
+  }
 
-  // Use stack allocation on CPU side.
-  addLinalgBufferizePasses(passManager, cpuAllocationFunction);
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+  // Verify that the workload per workgroup is set and is non-zero.
+  SmallVector<int64_t> workloadPerWorkgroup =
+      translationInfo.getWorkloadPerWorkgroupVals();
+  SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+  if (workloadPerWorkgroup.size() != partitionedLoops.size()) {
+    return op->emitOpError("expected ")
+           << partitionedLoops.size()
+           << " entries for workload_per_wg, but got "
+           << workloadPerWorkgroup.size();
+  }
+  if (llvm::any_of(workloadPerWorkgroup,
+                   [](int64_t val) { return val == 0; })) {
+    return op->emitOpError("invalid to use 0 in workload_per_wg");
+  }
 
-  // Tile and vectorize linalg ops on buffers.
-  passManager.addNestedPass<FuncOp>(
-      createLLVMCPUVectorizationPass(lowerToVectors));
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+  if (loweringConfig.getTileSizes().size() != 3) {
+    return op->emitOpError("expected three levels of tile sizes for ")
+           << pipelineName << ", got " << loweringConfig.getTileSizes().size();
+  }
+  SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(0);
+  if (!firstLevelTileSizes.empty()) {
+    // Verify that if the first-level tile sizes are set, they are the same as
+    // workload_per_wg for the partitioned loops.
+    size_t minElements =
+        (partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1);
+    if (firstLevelTileSizes.size() < minElements) {
+      return op->emitOpError("expected at least ")
+             << minElements
+             << " size for first level tiling to get the distribution fully "
+                "specified.";
+    }
+    llvm::SmallDenseSet<unsigned> partitionedLoopsSet;
+    partitionedLoopsSet.insert(partitionedLoops.begin(),
+                               partitionedLoops.end());
+    SmallVector<int64_t> partitionedTileSizes;
+    for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) {
+      if (!partitionedLoopsSet.count(tileSize.index())) {
+        continue;
+      }
+      partitionedTileSizes.push_back(tileSize.value());
+    }
+    for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) {
+      if (val.value() != partitionedTileSizes[val.index()]) {
+        return op->emitOpError("mismatch in distributed tile size value ")
+               << partitionedTileSizes[val.index()] << " at position "
+               << val.index() << " and workload_per_wg value " << val.value();
+      }
+    }
+  }
 
-  passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
+  // Verify that native vector size is either empty, or if set is same as the
+  // last level of tiling
+  SmallVector<int64_t> nativeVectorSize =
+      loweringConfig.getNativeVectorSizeVals();
+  if (!nativeVectorSize.empty()) {
+    if (nativeVectorSize !=
+        loweringConfig.getTileSizeVals(
+            static_cast<unsigned>(TilingLevel::VectorTiles))) {
+      return op->emitOpError(
+          "native_vector_size must be same as the last level of tiling");
+    }
+  }
+  return success();
 }
 
 void addTensorToVectorsPassPipeline(OpPassManager &passManager,
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 0c8c8c6..342b3ca 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -22,8 +22,8 @@
             "hal_interface_bindings.mlir",
             "hal_interface_constants.mlir",
             "hal_interface_workgroup_info.mlir",
+            "illegal_configuration.mlir",
             "materialize_launch_configuration.mlir",
-            "matmul_vectorization.mlir",
             "synchronize_symbol_visibility.mlir",
             "test_config_mmt4d.mlir",
             "tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index ac58c8a..41f64cb 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -17,8 +17,8 @@
     "hal_interface_bindings.mlir"
     "hal_interface_constants.mlir"
     "hal_interface_workgroup_info.mlir"
+    "illegal_configuration.mlir"
     "materialize_launch_configuration.mlir"
-    "matmul_vectorization.mlir"
     "synchronize_symbol_visibility.mlir"
     "test_config_mmt4d.mlir"
     "tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
new file mode 100644
index 0000000..57b4cc6
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -0,0 +1,154 @@
+// RUN: iree-opt -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' -verify-diagnostics -split-input-file %s 
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = []>
+hal.executable private @matmul_tensors  {
+  hal.interface @io {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+    hal.executable.entry_point @illegal attributes {
+      translation.info = #translation,
+      interface = @io,
+      ordinal = 0 : index
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+        %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+        %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+        // expected-error @+1 {{expected 2 entries for workload_per_wg, but got 0}}
+        linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+          outs(%result: memref<4x16xf32>)
+        return
+      }
+    }
+  }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [1, 0]>
+hal.executable private @matmul_tensors  {
+  hal.interface @io {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+    hal.executable.entry_point @illegal attributes {
+      translation.info = #translation,
+      interface = @io,
+      ordinal = 0 : index
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+        %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+        %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+        // expected-error @+1 {{invalid to use 0 in workload_per_wg}}
+        linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+          outs(%result: memref<4x16xf32>)
+        return
+      }
+    }
+  }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [1, 1]>
+hal.executable private @matmul_tensors  {
+  hal.interface @io {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+    hal.executable.entry_point @illegal attributes {
+      translation.info = #translation,
+      interface = @io,
+      ordinal = 0 : index
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+        %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+        %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+        // expected-error @+1 {{expected three levels of tile sizes for CPUTensorToVectors, got 0}}
+        linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+          outs(%result: memref<4x16xf32>)
+        return
+      }
+    }
+  }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [[4, 8], [], []], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [8, 6]>
+hal.executable private @matmul_tensors  {
+  hal.interface @io {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+    hal.executable.entry_point @illegal attributes {
+      translation.info = #translation,
+      interface = @io,
+      ordinal = 0 : index
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+        %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+        %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+        // expected-error @+1 {{mismatch in distributed tile size value 4 at position 0 and workload_per_wg value 6}}
+        linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+          outs(%result: memref<4x16xf32>)
+        return
+      }
+    }
+  }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [[], [], [8, 8, 8]], native_vector_size = [4, 4, 4]>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [8, 4]>
+hal.executable private @matmul_tensors  {
+  hal.interface @io {
+    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+    hal.executable.entry_point @illegal attributes {
+      translation.info = #translation,
+      interface = @io,
+      ordinal = 0 : index
+    }
+    builtin.module {
+      func @illegal() {
+        %c0 = arith.constant 0 : index
+        %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+        %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+        %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+        // expected-error @+1 {{native_vector_size must be same as the last level of tiling}}
+        linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+          outs(%result: memref<4x16xf32>)
+        return
+      }
+    }
+  }
+}
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 11bc34b..a521311 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -19,16 +19,13 @@
       func @matmul_tensors() {
         %c0 = arith.constant 0 : index
         %c1 = arith.constant 1 : index
-        %pcM = hal.interface.load.constant offset = 0 : index
-        %pcN = hal.interface.load.constant offset = 1 : index
-        %pcK = hal.interface.load.constant offset = 2 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%pcM, %pcK}
-        %2 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?x?xf32>{%pcK, %pcN}
-        %4 = hal.interface.binding.subspan @io::@arg2[%c0] : memref<?x?xf32>{%pcM, %pcN}
-        %6 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%pcM, %pcN}
-        %M = memref.dim %0, %c0 : memref<?x?xf32>
-        %N = memref.dim %2, %c1 : memref<?x?xf32>
-        %K = memref.dim %0, %c1 : memref<?x?xf32>
+        %M = hal.interface.load.constant offset = 0 : index
+        %N = hal.interface.load.constant offset = 1 : index
+        %K = hal.interface.load.constant offset = 2 : index
+        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
+        %2 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
+        %4 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
+        %6 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
         %workgroup_size_x = hal.interface.workgroup.size[0] : index
         %workgroup_size_y = hal.interface.workgroup.size[1] : index
         %workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -42,15 +39,12 @@
           %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_size_x, %workgroup_count_x]
           scf.for %arg1 = %10 to %N step %11 {
             %12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %N]
-            %13 = memref.subview %0[%arg0, 0] [%12, %K] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+            %13 = flow.dispatch.tensor.load %0, offsets=[%arg0, 0], sizes=[%12, %K], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
             %14 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %M]
-            %15 = memref.subview %2[0, %arg1] [%K, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            %16 = memref.subview %4[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            %17 = memref.alloc(%12, %14) : memref<?x?xf32>
-            linalg.copy(%16, %17) : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32>
-            linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %15 : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) outs(%17 : memref<?x?xf32>)
-            %18 = memref.subview %6[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            linalg.copy(%17, %18) : memref<?x?xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+            %15 = flow.dispatch.tensor.load %2, offsets=[0, %arg1], sizes=[%K, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+            %16 = flow.dispatch.tensor.load %4, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %6, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
           }
         }
         return
@@ -96,19 +90,23 @@
         %c0 = arith.constant 0 : index
         %dim0 = hal.interface.load.constant offset = 0 : index
         %dim1 = hal.interface.load.constant offset = 1 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?xf32>{%dim1}
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-        linalg.generic {
+        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
+        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xf32>{%dim1}
+        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim0, %dim1}
+        %3 = flow.dispatch.tensor.load %0, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?xf32> -> tensor<?xf32>
+        %5 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xf32>
+        %6 = linalg.generic {
           indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                            affine_map<(d0, d1) -> (d1)>,
                            affine_map<(d0, d1) -> (d0, d1)>],
           iterator_types = ["parallel", "parallel"]}
-          ins(%0, %1 : memref<?x?xf32>, memref<?xf32>) outs(%2 : memref<?x?xf32>) {
+          ins(%3, %4 : tensor<?x?xf32>, tensor<?xf32>) outs(%5 : tensor<?x?xf32>) {
           ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  // no predecessors
-            %3 = arith.addf %arg0, %arg1 : f32
-            linalg.yield %3 : f32
-          }
+            %7 = arith.addf %arg0, %arg1 : f32
+            linalg.yield %7 : f32
+          } -> tensor<?x?xf32>
+        flow.dispatch.tensor.store %6, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
         return
       }
       hal.interface private @io  {
@@ -382,23 +380,22 @@
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [1, 32, 32, 32], [1, 4, 4, 4]{{\]}}, native_vector_size = [1, 4, 4, 4]>
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64, 1]>
 //      CHECK: hal.executable.entry_point public @batch_matmul_tensors
 // CHECK-NEXT: (%[[ARG0:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:  %[[ARG1:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:  %[[ARG2:[a-zA-Z0-9]+]]: index)
-//  CHECK-DAG:  %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-DAG:  %[[D0:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]]]
 //  CHECK-DAG:  %[[D1:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]]]
-//      CHECK:  hal.return %[[D0]], %[[D1]], %[[C1]]
+//      CHECK:  hal.return %[[D0]], %[[D1]], %[[ARG2]]
 //      CHECK:  linalg.batch_matmul
 // CHECK-SAME:    lowering.config = #[[CONFIG]]
 
 // -----
 
 #compilation = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 hal.executable private @preset_config_matmul_tensors  {
   hal.executable.variant @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64"> {
@@ -452,10 +449,10 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[32, 32, 32]{{\]}}, native_vector_size = []>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [32, 32, 32], [4, 4, 4]{{\]}}, native_vector_size = [4, 4, 4]>
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 32)>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>
 //      CHECK: hal.executable.entry_point
 // CHECK-SAME:     translation.info = #[[TRANSLATION]]
 // CHECK-NEXT:   ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index
diff --git a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir b/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
deleted file mode 100644
index 80d6213..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
+++ /dev/null
@@ -1,143 +0,0 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{use-lowering-pipeline='builtin.func(iree-llvmcpu-vectorization)'}))" -split-input-file %s | IreeFileCheck %s
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @dynamic_matmul  {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xf32>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xf32>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xf32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xf32> to memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_128x128x128
-//   CHECK-DAG:   %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-//   CHECK-DAG:   %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-//   CHECK-DAG:   %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-//   CHECK-DAG:   %[[START:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-//   CHECK-DAG:   %[[VECTOR_SIZE:.+]] = arith.constant 4
-//   CHECK-DAG:   %[[L1_SIZE:.+]] = arith.constant 32
-//   CHECK-DAG:   %[[KDIM_SIZE:.+]] = arith.constant 128
-//       CHECK:   scf.for
-//       CHECK:     scf.for
-//       CHECK:       scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:         scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:           scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:             scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:               scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:                 %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-//       CHECK:                   %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
-
-// -----
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @matmul_i8_i8_i32  {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_i8_i8_i32_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_i8_i8_i32_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xi8>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xi8>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xi32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xi8> to memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xi8> to memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_i8_i8_i32_128x128x128
-//   CHECK-DAG:   %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-//   CHECK-DAG:   %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-//   CHECK-DAG:   %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-//   CHECK-DAG:   %[[START:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-//   CHECK-DAG:   %[[VECTOR_SIZE:.+]] = arith.constant 4
-//   CHECK-DAG:   %[[L1_SIZE:.+]] = arith.constant 32
-//   CHECK-DAG:   %[[KDIM_SIZE:.+]] = arith.constant 128
-//       CHECK:   scf.for
-//       CHECK:     scf.for
-//       CHECK:       scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:         scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:           scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:             scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:               scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:                 %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                   scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-//       CHECK:                     %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
diff --git a/iree/compiler/Codegen/Passes.cpp b/iree/compiler/Codegen/Passes.cpp
index 19acfec..ae9e935 100644
--- a/iree/compiler/Codegen/Passes.cpp
+++ b/iree/compiler/Codegen/Passes.cpp
@@ -6,6 +6,8 @@
 
 #include "iree/compiler/Codegen/Passes.h"
 
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
+
 namespace mlir {
 namespace iree_compiler {
 
@@ -48,5 +50,21 @@
       });
 }
 
+/// Hook to verify the lowering configuration and translation info for an
+/// operation.
+LogicalResult verifyLoweringConfiguration(
+    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    IREE::Codegen::TranslationInfoAttr translationInfo,
+    ArrayRef<int64_t> workgroupSize) {
+  switch (translationInfo.getDispatchLoweringPassPipeline()) {
+    case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
+      return verifyTensorToVectorsPassPipelineConfig(op, loweringConfig,
+                                                     translationInfo);
+    default:
+      break;
+  }
+  return success();
+}
+
 }  // namespace iree_compiler
 }  // namespace mlir
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index dd02ee4..516be98 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -9,6 +9,7 @@
 
 #include <memory>
 
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "mlir/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -19,9 +20,15 @@
 namespace mlir {
 namespace iree_compiler {
 
-// Registers all conversion passes in this directory.
+/// Registers all conversion passes in this directory.
 void registerCodegenPasses();
 
+/// Verify that the configuration used for compilation is valid.
+LogicalResult verifyLoweringConfiguration(
+    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    IREE::Codegen::TranslationInfoAttr translationInfo,
+    ArrayRef<int64_t> workgroupSize = {});
+
 //------------------------------------------------------------------------------
 // Misc/common conversions
 //------------------------------------------------------------------------------
@@ -184,13 +191,12 @@
 /// to memrefs
 void addCPUDefaultPassPipeline(OpPassManager &passManager);
 
-/// Populates the passes needed to lower to vector operations using linalg based
-/// progressive lowering with vectorization after bufferization.
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors = true);
-
 /// Populates the passes needed to multi level tile and lowering of linalg ops
 /// on tensors to vectors operations.
+LogicalResult verifyTensorToVectorsPassPipelineConfig(
+    Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+    IREE::Codegen::TranslationInfoAttr translationInfo,
+    ArrayRef<int64_t> workgroupSize = {});
 void addTensorToVectorsPassPipeline(OpPassManager &passManager,
                                     bool lowerToVectors = true,
                                     bool useTileAndVectorizeV2 = false);
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index ef1570f..0c8f923 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -153,12 +153,6 @@
   let constructor = "mlir::iree_compiler::createLLVMCPUUnfuseFMAOpsPass()";
 }
 
-def LLVMCPUVectorization :
-    Pass<"iree-llvmcpu-vectorization", "FuncOp"> {
-  let summary = "Tile and vectorize for CPU backends";
-  let constructor = "mlir::iree_compiler::createLLVMCPUVectorizationPass()";
-}
-
 def VectorToAArch64InlineAsm :
     Pass<"iree-llvmcpu-vector-to-aarch64-inline-asm", "FuncOp"> {
   let summary = "Convert vector operations to aarch64 inline asm LLVMIR dialect";
diff --git a/iree/test/e2e/regression/lowering_config.mlir b/iree/test/e2e/regression/lowering_config.mlir
index 17b401f..ab4857d 100644
--- a/iree/test/e2e/regression/lowering_config.mlir
+++ b/iree/test/e2e/regression/lowering_config.mlir
@@ -1,10 +1,10 @@
 #compilation0 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 #compilation1 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[64, 64, 64]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [64, 64]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [64, 64, 64], [16, 16, 16]], native_vector_size = [16, 16, 16]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>,
     workgroup_size = []>
 func @lowering_config_test() {
   %a = util.unfoldable_constant dense<1.0> : tensor<128x256xf32>