Remove buffer-based pipeline on CPU. (#7665)

Deletes the `CPUVectorization` pass pipeline, which bufferized first and then
tiled and vectorized linalg ops on memrefs. CPU dispatches now go through the
tensor-based pipelines (`CPUTensorToVectors` and `CPUTileFuseAndVectorize`),
which tile and vectorize on tensors and bufferize late. Concretely, this drops
the pipeline from the `DispatchLoweringPassPipeline` enum, removes the
`LLVMCPUVectorization` pass, its registration, and its test, and migrates
lowering configs in tests from the old single-level tile sizes to the
three-level (workgroup/L1/vector) form used by `CPUTensorToVectors`.

diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 62124cf..95251a1 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -12,8 +12,6 @@
 // List of pre-existing pipelines for translating executables.
 def CPU_Default
     : StrEnumAttrCase<"CPUDefault">;
-def CPU_Vectorization
-    : StrEnumAttrCase<"CPUVectorization">;
 def CPU_TensorToVectors
     : StrEnumAttrCase<"CPUTensorToVectors">;
 def CPU_TileFuseAndVectorize
@@ -42,8 +40,8 @@
     "DispatchLoweringPassPipeline",
     "identifier for pass pipeline use to lower dispatch region",
     [CPU_Default, CPU_TensorToVectors, CPU_TileFuseAndVectorize,
-     CPU_Vectorization, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
-     LLVMGPU_MatmulSimt, SPIRV_SimpleDistribute, SPIRV_Vectorize,
+     LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+     SPIRV_SimpleDistribute, SPIRV_Vectorize,
      SPIRV_VectorizeToCooperativeOps, None]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
 }
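
Note: executables continue to select among the remaining pipelines by name
through the `translation.info` attribute; for example (syntax taken verbatim
from the test updates further down in this patch):

    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>
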
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index 368243a..97c33b6 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -20,7 +20,6 @@
         "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUUnfuseFMAOps.cpp",
-        "LLVMCPUVectorization.cpp",
         "Passes.cpp",
         "VectorContractToAArch64InlineAsmOp.cpp",
     ],
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4e7ff7a..d2061de 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -23,7 +23,6 @@
     "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUUnfuseFMAOps.cpp"
-    "LLVMCPUVectorization.cpp"
     "Passes.cpp"
     "VectorContractToAArch64InlineAsmOp.cpp"
   DEPS
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index a818279..72078cd 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -183,9 +183,6 @@
           case IREE::Codegen::DispatchLoweringPassPipeline::None:
             addCPUDefaultPassPipeline(nestedModulePM);
             break;
-          case IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization:
-            addCPUVectorizationPassPipeline(nestedModulePM, lowerToVectors);
-            break;
           case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
             addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
             break;
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
deleted file mode 100644
index d27153e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright 2020 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
-#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-linalg-to-llvm-tile-and-vectorize"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-// Could just be linalg::TilingPattern with a ContractionOpInterface filter, but
-// that is always templated on an op.
-struct TileWorkgroups : public linalg::LinalgBaseTilingPattern {
-  using Base = linalg::LinalgBaseTilingPattern;
-  TileWorkgroups(MLIRContext *context, linalg::LinalgTilingOptions options,
-                 linalg::LinalgTransformationFilter marker)
-      : LinalgBaseTilingPattern(context, options, marker) {}
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const override {
-    auto contractionOp = dyn_cast<linalg::ContractionOpInterface>(op);
-    if (!contractionOp) return failure();
-
-    linalg::TiledLinalgOp tiledLinalgOp;
-    if (failed(Base::matchAndRewriteBase(op, rewriter, tiledLinalgOp)) ||
-        !tiledLinalgOp.tensorResults.empty()) {
-      return failure();
-    }
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-}  // namespace
-
-namespace {
-struct LLVMCPUVectorizationPass
-    : public LLVMCPUVectorizationBase<LLVMCPUVectorizationPass> {
-  LLVMCPUVectorizationPass(bool vectorize = true) : lowerToVectors(vectorize) {}
-  LLVMCPUVectorizationPass(const LLVMCPUVectorizationPass &pass) {
-    lowerToVectors = pass.lowerToVectors;
-  }
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, AffineDialect, scf::SCFDialect,
-                    vector::VectorDialect>();
-  }
-  void runOnOperation() override;
-
- private:
-  /// TODO(ravishankarm): Option to not generate any `vector.` instructions. The
-  /// VMVX backend uses the same lowering as the CPU pass, but has no lowering
-  /// of these `vector.` operations to scalar code. So, as a workaround, apply
-  /// the same tiling scheme but avoid generating vector instructions. When
-  /// VMVX can handle vector instructions, drop this option.
-  bool lowerToVectors;
-
-  Option<bool> enableVectorContractToAarch64Asm{
-      *this, "vector-contract-to-aarch64-asm",
-      llvm::cl::desc("Enable lowering of vector.contract ops to AArch64 "
-                     "inline assembly."),
-      llvm::cl::init(false)};
-};
-}  // namespace
-
-void LLVMCPUVectorizationPass::runOnOperation() {
-  auto funcOp = getOperation();
-  MLIRContext *context = &getContext();
-
-  // Workgroup first level of tiling.
-  {
-    // First level of tiling patterns. (workgroups memory)
-    RewritePatternSet l1patterns(context);
-    l1patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(builder, op,
-                                  static_cast<unsigned>(TilingLevel::L1Tiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            ArrayRef<Identifier>{},
-            Identifier::get(getWorkgroupL1TileMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l1patterns));
-  }
-
-  // Second level of tiling. (workgroups memory -> vectors)
-  {
-    RewritePatternSet l2patterns(context);
-    l2patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(
-                  builder, op, static_cast<unsigned>(TilingLevel::VectorTiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getWorkgroupL1TileMarker(), context),
-            Identifier::get(getVectorizeMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l2patterns));
-  }
-
-  // Apply canonicalization.
-  {
-    RewritePatternSet canonicalizationPatterns =
-        linalg::getLinalgTilingCanonicalizationPatterns(context);
-    populateAffineMinCanonicalizationPattern(canonicalizationPatterns);
-    if (failed(applyPatternsAndFoldGreedily(
-            funcOp, std::move(canonicalizationPatterns)))) {
-      return signalPassFailure();
-    }
-  }
-
-  if (!lowerToVectors) {
-    return;
-  }
-
-  // Op specific conversion.
-  {
-    RewritePatternSet vectorizeOpsPatterns(context);
-    populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPatterns);
-    if (failed(applyPatternsAndFoldGreedily(funcOp,
-                                            std::move(vectorizeOpsPatterns)))) {
-      return signalPassFailure();
-    }
-  }
-
-  // Apply vectorization patterns.
-  {
-    RewritePatternSet vectorizationPatterns(context);
-    linalg::insertVectorizationPatterns<linalg::ContractionOpInterface,
-                                        linalg::CopyOp, linalg::FillOp>(
-        vectorizationPatterns, linalg::LinalgVectorizationOptions(),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getVectorizeMarker(), context)));
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorizationPatterns);
-    vector::populateVectorReductionToContractPatterns(vectorizationPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorizationPatterns));
-  }
-
-  {
-    // Fold consumer add ops into the contraction op itself.
-    RewritePatternSet canonicalizationPatterns(context);
-    vector::ContractionOp::getCanonicalizationPatterns(canonicalizationPatterns,
-                                                       context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(canonicalizationPatterns));
-  }
-
-  if (enableVectorContractToAarch64Asm) {
-    RewritePatternSet vectorToAArch64AsmPatterns(context);
-    populateVectorContractToAArch64InlineAsm(vectorToAArch64AsmPatterns,
-                                             context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToAArch64AsmPatterns));
-  }
-
-  // Apply vector specific operation lowering.
-  {
-    vector::VectorTransformsOptions vectorTransformsOptions =
-        vector::VectorTransformsOptions().setVectorTransformsOptions(
-            vector::VectorContractLowering::OuterProduct);
-    RewritePatternSet vectorContractLoweringPatterns(context);
-    vectorContractLoweringPatterns.insert<
-        vector::ContractionOpToOuterProductOpLowering,
-        vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
-        vectorTransformsOptions, context);
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorContractLoweringPatterns);
-    (void)applyPatternsAndFoldGreedily(
-        funcOp, std::move(vectorContractLoweringPatterns));
-  }
-
-  // Hoist hierarchical tiling indexing and other loop-invariant transfer
-  // op computations.
-
-  // Programmatically controlled lowering of vector.transfer ops only.
-  {
-    VectorTransferToSCFOptions vectorToSCFOptions =
-        VectorTransferToSCFOptions().enableFullUnroll();
-    RewritePatternSet vectorToLoopsPatterns(context);
-    populateVectorToSCFConversionPatterns(vectorToLoopsPatterns,
-                                          vectorToSCFOptions);
-    // Hoist hierarchical tiling indexing and other loop-invariant transfer
-    // op computations.
-    linalg::hoistRedundantVectorTransfers(funcOp);
-
-    memref::populateFoldSubViewOpPatterns(vectorToLoopsPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToLoopsPatterns));
-  }
-}
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
-    bool lowerToVectors) {
-  return std::make_unique<LLVMCPUVectorizationPass>(lowerToVectors);
-}
-
-}  // namespace iree_compiler
-}  // namespace mlir
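
For context on what the deleted pass did: it ran two marker-driven tiling
levels over buffer-based linalg ops, then vectorized the innermost tiles. A
schematic of the marker progression (the marker strings below are illustrative
stand-ins for `getWorkgroupL1TileMarker()` and `getVectorizeMarker()`, and
`#map` stands for the usual strided subview layout):

    // After the first TileWorkgroups application: L1-sized tiles, tagged
    // with the L1 marker so the second tiling level can find them.
    linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile"}
        ins(%7, %8 : memref<32x32xf32, #map>, memref<32x32xf32, #map>)
        outs(%9 : memref<32x32xf32, #map>)
    // After the second application: vector-sized tiles, retagged with the
    // marker that the vectorization patterns match on.
    linalg.matmul {__internal_linalg_transform__ = "vectorize"}
        ins(%10, %11 : memref<4x4xf32, #map>, memref<4x4xf32, #map>)
        outs(%12 : memref<4x4xf32, #map>)
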
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 195e3fe..cb0a934 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -27,28 +27,6 @@
   return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
 }
 
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors) {
-  passManager.addPass(createCanonicalizerPass());
-
-  // TODO(ataei): This causes segmentation fault on Android. Fix it and
-  // re-enable.
-  // passManager.addNestedPass<FuncOp>(createPadLinalgWorkgroupTilesPass());
-
-  // Use stack allocation on CPU side.
-  addLinalgBufferizePasses(passManager, cpuAllocationFunction);
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
-
-  // Tile and vectorize linalg ops on buffers.
-  passManager.addNestedPass<FuncOp>(
-      createLLVMCPUVectorizationPass(lowerToVectors));
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
-
-  passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
-}
-
 LogicalResult verifyTensorToVectorsPassPipelineConfig(
     Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
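
The practical difference shows up in the test churn below: the removed
pipeline bufferized before tiling, so kernels tiled memref.subview slices,
while the surviving tensor pipelines tile flow.dispatch.tensor loads and
bufferize late. Side by side, using the matmul operand from the
materialize_launch_configuration.mlir hunk that follows:

    // Buffer-based form consumed by the removed pipeline:
    %13 = memref.subview %0[%arg0, 0] [%12, %K] [1, 1]
        : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
    // Tensor-based form consumed by the remaining pipelines:
    %13 = flow.dispatch.tensor.load %0, offsets=[%arg0, 0], sizes=[%12, %K], strides=[1, 1]
        : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
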
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 466d805..342b3ca 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,7 +24,6 @@
             "hal_interface_workgroup_info.mlir",
             "illegal_configuration.mlir",
             "materialize_launch_configuration.mlir",
-            "matmul_vectorization.mlir",
             "synchronize_symbol_visibility.mlir",
             "test_config_mmt4d.mlir",
             "tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 985c7e6..41f64cb 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,7 +19,6 @@
     "hal_interface_workgroup_info.mlir"
     "illegal_configuration.mlir"
     "materialize_launch_configuration.mlir"
-    "matmul_vectorization.mlir"
     "synchronize_symbol_visibility.mlir"
     "test_config_mmt4d.mlir"
     "tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 1e7a7ef..a521311 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -19,16 +19,13 @@
       func @matmul_tensors() {
         %c0 = arith.constant 0 : index
         %c1 = arith.constant 1 : index
-        %pcM = hal.interface.load.constant offset = 0 : index
-        %pcN = hal.interface.load.constant offset = 1 : index
-        %pcK = hal.interface.load.constant offset = 2 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%pcM, %pcK}
-        %2 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?x?xf32>{%pcK, %pcN}
-        %4 = hal.interface.binding.subspan @io::@arg2[%c0] : memref<?x?xf32>{%pcM, %pcN}
-        %6 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%pcM, %pcN}
-        %M = memref.dim %0, %c0 : memref<?x?xf32>
-        %N = memref.dim %2, %c1 : memref<?x?xf32>
-        %K = memref.dim %0, %c1 : memref<?x?xf32>
+        %M = hal.interface.load.constant offset = 0 : index
+        %N = hal.interface.load.constant offset = 1 : index
+        %K = hal.interface.load.constant offset = 2 : index
+        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
+        %2 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
+        %4 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
+        %6 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
         %workgroup_size_x = hal.interface.workgroup.size[0] : index
         %workgroup_size_y = hal.interface.workgroup.size[1] : index
         %workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -42,15 +39,12 @@
           %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_size_x, %workgroup_count_x]
           scf.for %arg1 = %10 to %N step %11 {
             %12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %N]
-            %13 = memref.subview %0[%arg0, 0] [%12, %K] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+            %13 = flow.dispatch.tensor.load %0, offsets=[%arg0, 0], sizes=[%12, %K], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
             %14 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %M]
-            %15 = memref.subview %2[0, %arg1] [%K, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            %16 = memref.subview %4[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            %17 = memref.alloc(%12, %14) : memref<?x?xf32>
-            linalg.copy(%16, %17) : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32>
-            linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %15 : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) outs(%17 : memref<?x?xf32>)
-            %18 = memref.subview %6[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-            linalg.copy(%17, %18) : memref<?x?xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+            %15 = flow.dispatch.tensor.load %2, offsets=[0, %arg1], sizes=[%K, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+            %16 = flow.dispatch.tensor.load %4, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %6, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
           }
         }
         return
@@ -96,19 +90,23 @@
         %c0 = arith.constant 0 : index
         %dim0 = hal.interface.load.constant offset = 0 : index
         %dim1 = hal.interface.load.constant offset = 1 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?xf32>{%dim1}
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-        linalg.generic {
+        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
+        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xf32>{%dim1}
+        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim0, %dim1}
+        %3 = flow.dispatch.tensor.load %0, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?xf32> -> tensor<?xf32>
+        %5 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xf32>
+        %6 = linalg.generic {
           indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                            affine_map<(d0, d1) -> (d1)>,
                            affine_map<(d0, d1) -> (d0, d1)>],
           iterator_types = ["parallel", "parallel"]}
-          ins(%0, %1 : memref<?x?xf32>, memref<?xf32>) outs(%2 : memref<?x?xf32>) {
+          ins(%3, %4 : tensor<?x?xf32>, tensor<?xf32>) outs(%5 : tensor<?x?xf32>) {
           ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  // no predecessors
-            %3 = arith.addf %arg0, %arg1 : f32
-            linalg.yield %3 : f32
-          }
+            %7 = arith.addf %arg0, %arg1 : f32
+            linalg.yield %7 : f32
+          } -> tensor<?x?xf32>
+        flow.dispatch.tensor.store %6, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
         return
       }
       hal.interface private @io  {
@@ -396,8 +394,8 @@
 // -----
 
 #compilation = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 hal.executable private @preset_config_matmul_tensors  {
   hal.executable.variant @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64"> {
@@ -451,10 +449,10 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[32, 32, 32]{{\]}}, native_vector_size = []>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [32, 32, 32], [4, 4, 4]{{\]}}, native_vector_size = [4, 4, 4]>
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 32)>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>
 //      CHECK: hal.executable.entry_point
 // CHECK-SAME:     translation.info = #[[TRANSLATION]]
 // CHECK-NEXT:   ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index
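
Reading the new three-level config: each inner list gives the tile sizes for
one tiling level, outermost first. Annotated (the level roles follow the
conventional workgroup/L1/vector split used by the CPU passes; the comments
are an interpretation aid, not new syntax):

    #iree_codegen.lowering.config<
        tile_sizes = [[],             // workgroup level: empty, sizes derive from workload_per_wg
                      [32, 32, 32],   // L1 tiling level
                      [4, 4, 4]],     // vector tiling level
        native_vector_size = [4, 4, 4]>
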
diff --git a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir b/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
deleted file mode 100644
index 80d6213..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
+++ /dev/null
@@ -1,143 +0,0 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{use-lowering-pipeline='builtin.func(iree-llvmcpu-vectorization)'}))" -split-input-file %s | IreeFileCheck %s
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @dynamic_matmul  {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xf32>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xf32>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xf32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xf32> to memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_128x128x128
-//   CHECK-DAG:   %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-//   CHECK-DAG:   %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-//   CHECK-DAG:   %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-//   CHECK-DAG:   %[[START:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[WORKGROUP_SIZE:.+]] = arith.constant 64
-//   CHECK-DAG:   %[[VECTOR_SIZE:.+]] = arith.constant 4
-//   CHECK-DAG:   %[[L1_SIZE:.+]] = arith.constant 32
-//   CHECK-DAG:   %[[KDIM_SIZE:.+]] = arith.constant 128
-//       CHECK:   scf.for
-//       CHECK:     scf.for
-//       CHECK:       scf.for {{.*}} = %[[START]] to %[[WORKGROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:         scf.for {{.*}} = %[[START]] to %[[WORKGROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:           scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:             scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:               scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:                 %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-//       CHECK:                   %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                   %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_B_1:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                   %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
-
-// -----
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @matmul_i8_i8_i32  {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_i8_i8_i32_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_i8_i8_i32_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xi8>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xi8>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xi32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xi8> to memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xi8> to memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_i8_i8_i32_128x128x128
-//   CHECK-DAG:   %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-//   CHECK-DAG:   %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-//   CHECK-DAG:   %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-//   CHECK-DAG:   %[[START:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[WORKGROUP_SIZE:.+]] = arith.constant 64
-//   CHECK-DAG:   %[[VECTOR_SIZE:.+]] = arith.constant 4
-//   CHECK-DAG:   %[[L1_SIZE:.+]] = arith.constant 32
-//   CHECK-DAG:   %[[KDIM_SIZE:.+]] = arith.constant 128
-//       CHECK:   scf.for
-//       CHECK:     scf.for
-//       CHECK:       scf.for {{.*}} = %[[START]] to %[[WORKGROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:         scf.for {{.*}} = %[[START]] to %[[WORKGROUP_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:           scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-//       CHECK:             scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:               scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-//       CHECK:                 %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                 %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-//       CHECK:                   scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-//       CHECK:                     %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-//       CHECK:                     %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_B_1:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-//       CHECK:                     %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 06f1f13..516be98 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -191,11 +191,6 @@
 /// to memrefs
 void addCPUDefaultPassPipeline(OpPassManager &passManager);
 
-/// Populates the passes needed to lower to vector operations using linalg based
-/// progressive lowering with vectorization after bufferization.
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors = true);
-
 /// Populates the passes needed to multi level tile and lowering of linalg ops
 /// on tensors to vectors operations.
 LogicalResult verifyTensorToVectorsPassPipelineConfig(
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index ef1570f..0c8f923 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -153,12 +153,6 @@
   let constructor = "mlir::iree_compiler::createLLVMCPUUnfuseFMAOpsPass()";
 }
 
-def LLVMCPUVectorization :
-    Pass<"iree-llvmcpu-vectorization", "FuncOp"> {
-  let summary = "Tile and vectorize for CPU backends";
-  let constructor = "mlir::iree_compiler::createLLVMCPUVectorizationPass()";
-}
-
 def VectorToAArch64InlineAsm :
     Pass<"iree-llvmcpu-vector-to-aarch64-inline-asm", "FuncOp"> {
   let summary = "Convert vector operations to aarch64 inline asm LLVMIR dialect";
diff --git a/iree/test/e2e/regression/lowering_config.mlir b/iree/test/e2e/regression/lowering_config.mlir
index 17b401f..ab4857d 100644
--- a/iree/test/e2e/regression/lowering_config.mlir
+++ b/iree/test/e2e/regression/lowering_config.mlir
@@ -1,10 +1,10 @@
 #compilation0 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 #compilation1 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[64, 64, 64]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [64, 64]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [64, 64, 64], [16, 16, 16]], native_vector_size = [16, 16, 16]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>,
     workgroup_size = []>
 func @lowering_config_test() {
   %a = util.unfoldable_constant dense<1.0> : tensor<128x256xf32>
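
The rest of this test (truncated here) attaches the two compilation infos to
individual ops so that both tilings are exercised end to end in one module.
Schematically, assuming an attachment attribute named compilation.info and an
illustrative second operand (neither is shown in this hunk):

    %b = util.unfoldable_constant dense<2.0> : tensor<256x512xf32>
    %0 = "mhlo.dot"(%a, %b) {compilation.info = #compilation0}
        : (tensor<128x256xf32>, tensor<256x512xf32>) -> tensor<128x512xf32>
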