Merge main -> google * 8b3790d5b [spirv] Remove code for deprecated Linalg on buffers path (#5626) * 5e66689d3 Guessing at making MacOS happy. (#5654) * 9475b3aeb Merge google -> main (#5646) * e221e5e9e [Talks] 2020-08-20: IREE CodeGen; MLIR ODM (#5585) * 1696d8c42 Only add operand-fusion to MobileNetV2. (#5639) * 9eaa01bbd Integrate MLIR-EmitC at iml130/mlir-emitc@679d7183 (#5633) * 7d0ab9ad3 [spirv] Delete experimental matmul vectorization code (#5638) PiperOrigin-RevId: 370960412

diff --git a/README.md b/README.md
index 71189a6..4d5a462 100644
--- a/README.md
+++ b/README.md

@@ -58,9 +58,13 @@
 
 We also have some public talks that explain IREE's concepts and architecture:
 
+*   2020-08-20: IREE CodeGen: MLIR Open Design Meeting Presentation
+    ([recording](https://drive.google.com/file/d/1325zKXnNIXGw3cdWrDWJ1-bp952wvC6W/view?usp=sharing)
+    and
+    [slides](https://docs.google.com/presentation/d/1NetHjKAOYg49KixY5tELqFp6Zr2v8_ujGzWZ_3xvqC8/edit))
 *   2020-03-18: Interactive HAL IR Walkthrough (Ben Vanik and core team)
     ([recording](https://drive.google.com/file/d/1_sWDgAPDfrGQZdxAapSA90AD1jVfhp-f/view?usp=sharing))
-*   2020-01-31: End-to-end MLIR Workflow in IREE
+*   2020-01-31: End-to-end MLIR Workflow in IREE: MLIR Open Design Meeting Presentation
     ([recording](https://drive.google.com/open?id=1os9FaPodPI59uj7JJI3aXnTzkuttuVkR)
     and
     [slides](https://drive.google.com/open?id=1RCQ4ZPQFK9cVgu3IH1e5xbrBcqy7d_cEZ578j84OvYI))

diff --git a/SUBMODULE_VERSIONS.txt b/SUBMODULE_VERSIONS.txt
index bdb740e..abb0e2d 100644
--- a/SUBMODULE_VERSIONS.txt
+++ b/SUBMODULE_VERSIONS.txt

@@ -6,7 +6,7 @@
 88b845dee001723c4a0db1fe5477de735b6d3bb0 third_party/liburing
 3d4a47eed849f540090e9699e1b4860977558c76 third_party/llvm-bazel
 bf9eef92b6cd71d262ac12ce6c4919271bd6c910 third_party/llvm-project
-3c265bf59bf2515a63ec35571c66954349749a62 third_party/mlir-emitc
+679d7183b657a24f48d16de1fcefb20d7cd1f6a2 third_party/mlir-emitc
 b2a23bf269d52976ff384a60a12826b541f1ebbe third_party/mlir-hlo
 2b2bd45bbf9be04fd22ece5cc1f54679202e9257 third_party/pffft
 d8c7ee00a687ac369e62e2032514a93a9b413502 third_party/pybind11

diff --git a/build_tools/mako/configuration.py b/build_tools/mako/configuration.py
index 6ad1d6f..256848d 100644
--- a/build_tools/mako/configuration.py
+++ b/build_tools/mako/configuration.py

@@ -94,11 +94,15 @@
     self.phones = phones
 
 
-def get_pixel4_default_target_list(skipped_target=None, batch_config=None):
+def get_pixel4_default_target_list(skipped_target=None,
+                                   batch_config=None,
+                                   compilation_flags=None):
   if skipped_target is None:
     skipped_target = []
   if batch_config is None:
     batch_config = []
+  if compilation_flags is None:
+    compilation_flags = []
   targets = [
       TargetInfo(driver="vmla",
                  hal_target_backend="vmla",
@@ -111,7 +115,6 @@
                  compilation_flags=[
                      "--iree-llvm-target-triple=aarch64-none-linux-android29",
                      "--iree-flow-inline-constants-max-byte-length=2048",
-                     "--iree-flow-dispatch-formation-enable-operand-fusion"
                  ]),
       TargetInfo(driver="dylib",
                  hal_target_backend="dylib-llvm-aot",
@@ -120,7 +123,6 @@
                  compilation_flags=[
                      "--iree-llvm-target-triple=aarch64-none-linux-android29",
                      "--iree-flow-inline-constants-max-byte-length=2048",
-                     "--iree-flow-dispatch-formation-enable-operand-fusion"
                  ],
                  runtime_flags=[
                      "--dylib_worker_count=3",
@@ -140,14 +142,20 @@
   for target in targets:
     if target.mako_tag in batch_config:
       target.add_batch_flag(batch_config[target.mako_tag])
+    if target.mako_tag in compilation_flags:
+      target.compilation_flags += compilation_flags[target.mako_tag]
   return targets
 
 
-def get_s20_default_target_list(skipped_target=None, batch_config=None):
+def get_s20_default_target_list(skipped_target=None,
+                                batch_config=None,
+                                compilation_flags=None):
   if skipped_target is None:
     skipped_target = []
   if batch_config is None:
     batch_config = []
+  if compilation_flags is None:
+    compilation_flags = []
   targets = [
       TargetInfo(driver="vmla",
                  hal_target_backend="vmla",
@@ -160,7 +168,6 @@
                  compilation_flags=[
                      "--iree-llvm-target-triple=aarch64-none-linux-android29",
                      "--iree-flow-inline-constants-max-byte-length=2048",
-                     "--iree-flow-dispatch-formation-enable-operand-fusion"
                  ]),
       TargetInfo(driver="dylib",
                  hal_target_backend="dylib-llvm-aot",
@@ -169,7 +176,6 @@
                  compilation_flags=[
                      "--iree-llvm-target-triple=aarch64-none-linux-android29",
                      "--iree-flow-inline-constants-max-byte-length=2048",
-                     "--iree-flow-dispatch-formation-enable-operand-fusion"
                  ],
                  runtime_flags=[
                      "--dylib_worker_count=3",
@@ -190,6 +196,8 @@
   for target in targets:
     if target.mako_tag in batch_config:
       target.add_batch_flag(batch_config[target.mako_tag])
+    if target.mako_tag in compilation_flags:
+      target.compilation_flags += compilation_flags[target.mako_tag]
   return targets
 
 
@@ -221,13 +229,31 @@
         model_path="mobilenet-v2/iree_input.mlir",
         flagfile_path="mobilenet-v2/flagfile",
         phones=[
-            PhoneBenchmarkInfo(name="Pixel4",
-                               benchmark_key="6338759231537152",
-                               targets=get_pixel4_default_target_list(
-                                   skipped_target=["vlk2"])),
-            PhoneBenchmarkInfo(name="S20",
-                               benchmark_key="5618403088793600",
-                               targets=get_s20_default_target_list()),
+            PhoneBenchmarkInfo(
+                name="Pixel4",
+                benchmark_key="6338759231537152",
+                targets=get_pixel4_default_target_list(
+                    skipped_target=["vlk2"],
+                    compilation_flags={
+                        'cpu': [
+                            "--iree-flow-dispatch-formation-enable-operand-fusion"
+                        ],
+                        'cpu3t': [
+                            "--iree-flow-dispatch-formation-enable-operand-fusion"
+                        ]
+                    })),
+            PhoneBenchmarkInfo(
+                name="S20",
+                benchmark_key="5618403088793600",
+                targets=get_s20_default_target_list(
+                    compilation_flags={
+                        'cpu': [
+                            "--iree-flow-dispatch-formation-enable-operand-fusion"
+                        ],
+                        'cpu3t': [
+                            "--iree-flow-dispatch-formation-enable-operand-fusion"
+                        ]
+                    })),
         ]),
     ModelBenchmarkInfo(
         name="mobilebert-f16",

diff --git a/iree/compiler/Conversion/Common/Attributes.h b/iree/compiler/Conversion/Common/Attributes.h
deleted file mode 100644
index 9659f3e..0000000
--- a/iree/compiler/Conversion/Common/Attributes.h
+++ /dev/null

@@ -1,31 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef IREE_COMPILER_CONVERSION_LINALGTOSPIRV_ATTRIBUTES_H_
-#define IREE_COMPILER_CONVERSION_LINALGTOSPIRV_ATTRIBUTES_H_
-
-#include "llvm/ADT/StringRef.h"
-
-namespace mlir {
-namespace iree_compiler {
-
-/// Attribute on a module op to denote the scheduling order of entry points.
-/// The attribute value is expected to be an array of entry point name strings.
-inline llvm::StringRef getEntryPointScheduleAttrName() {
-  return "hal.entry_point_schedule";
-}
-}  // namespace iree_compiler
-}  // namespace mlir
-
-#endif  // IREE_COMPILER_CONVERSION_LINALGTOSPIRV_ATTRIBUTES_H_

diff --git a/iree/compiler/Conversion/Common/BUILD b/iree/compiler/Conversion/Common/BUILD
index ce369b4..720c86a 100644
--- a/iree/compiler/Conversion/Common/BUILD
+++ b/iree/compiler/Conversion/Common/BUILD

@@ -32,7 +32,6 @@
         "VectorTransferOptimization.cpp",
     ],
     hdrs = [
-        "Attributes.h",
         "LaunchConfig.h",
         "Passes.h",
         "Transforms.h",

diff --git a/iree/compiler/Conversion/Common/CMakeLists.txt b/iree/compiler/Conversion/Common/CMakeLists.txt
index 4a29284..c77ce0c 100644
--- a/iree/compiler/Conversion/Common/CMakeLists.txt
+++ b/iree/compiler/Conversion/Common/CMakeLists.txt

@@ -14,7 +14,6 @@
   NAME
     Common
   HDRS
-    "Attributes.h"
     "LaunchConfig.h"
     "Passes.h"
     "Transforms.h"

diff --git a/iree/compiler/Conversion/Common/LaunchConfig.cpp b/iree/compiler/Conversion/Common/LaunchConfig.cpp
index 2462574..c17c51f 100644
--- a/iree/compiler/Conversion/Common/LaunchConfig.cpp
+++ b/iree/compiler/Conversion/Common/LaunchConfig.cpp

@@ -24,7 +24,6 @@
 #include "iree/compiler/Conversion/Common/LaunchConfig.h"
 
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"

diff --git a/iree/compiler/Conversion/Common/Transforms.cpp b/iree/compiler/Conversion/Common/Transforms.cpp
index 443eef0..a9dc928 100644
--- a/iree/compiler/Conversion/Common/Transforms.cpp
+++ b/iree/compiler/Conversion/Common/Transforms.cpp

@@ -23,7 +23,6 @@
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/TransformUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"

diff --git a/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributePass.cpp b/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributePass.cpp
index 44e96fd..c43ef16 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributePass.cpp
+++ b/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributePass.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/Common/Transforms.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/KernelDispatch.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/Passes.h"

diff --git a/iree/compiler/Conversion/LinalgToLLVM/MaterializeCPULaunchConfigurationPass.cpp b/iree/compiler/Conversion/LinalgToLLVM/MaterializeCPULaunchConfigurationPass.cpp
index bd45763..0071519 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/MaterializeCPULaunchConfigurationPass.cpp
+++ b/iree/compiler/Conversion/LinalgToLLVM/MaterializeCPULaunchConfigurationPass.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/Common/Transforms.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/KernelDispatch.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/Passes.h"

diff --git a/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp b/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp
index 6b2ff1b..4deba72 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp
+++ b/iree/compiler/Conversion/LinalgToLLVM/Passes.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Conversion/Common/Passes.h"
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/HLOToHLO/Passes.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/Passes.h"
 #include "iree/compiler/Dialect/Shape/Transforms/Passes.h"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/BUILD b/iree/compiler/Conversion/LinalgToSPIRV/BUILD
index 7cbc3cc..5434383 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/BUILD
+++ b/iree/compiler/Conversion/LinalgToSPIRV/BUILD

@@ -39,10 +39,7 @@
         "ConvertToSPIRVPass.cpp",
         "FoldGPUProcessorIDUses.cpp",
         "KernelDispatchUtils.cpp",
-        "LinalgTileAndDistributePass.cpp",
-        "MatMulVectorizationTest.cpp",
         "Passes.cpp",
-        "SplitDispatchFunctionPass.cpp",
         "TileAndVectorizeInOneWorkgroupPass.cpp",
         "Utils.cpp",
         "VectorToCooperativeMatrixPass.cpp",

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt b/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt
index ea663f5..dded958 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt
+++ b/iree/compiler/Conversion/LinalgToSPIRV/CMakeLists.txt

@@ -36,10 +36,7 @@
     "ConvertToSPIRVPass.cpp"
     "FoldGPUProcessorIDUses.cpp"
     "KernelDispatchUtils.cpp"
-    "LinalgTileAndDistributePass.cpp"
-    "MatMulVectorizationTest.cpp"
     "Passes.cpp"
-    "SplitDispatchFunctionPass.cpp"
     "TileAndVectorizeInOneWorkgroupPass.cpp"
     "Utils.cpp"
     "VectorToCooperativeMatrixPass.cpp"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.cpp b/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.cpp
index 1e2be3b..b901721 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.cpp

@@ -20,12 +20,6 @@
 namespace iree_compiler {
 
 SPIRVCodegenOptions getSPIRVCodegenOptionsFromClOptions() {
-  static llvm::cl::opt<bool> clEnableVectorization(
-      "iree-spirv-enable-vectorization",
-      llvm::cl::desc(
-          "Enable vectorization transformations in SPIR-V code generation"),
-      llvm::cl::init(false));
-
   static llvm::cl::list<unsigned> clWorkgroupTileSizes(
       "iree-spirv-workgroup-tile-size",
       llvm::cl::desc("Set tile sizes to use for each workgroup when tiling "
@@ -48,11 +42,6 @@
       llvm::cl::desc("Set workgroup size to use for SPIR-V code generation"),
       llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated);
 
-  static llvm::cl::opt<bool> clEnableLinalgOnTensorsSPIRV(
-      "iree-codegen-spirv-experimental-linalg-on-tensors",
-      llvm::cl::desc("Enable the linalg on tensors on SPIR-V path"),
-      llvm::cl::init(true));
-
   SPIRVCodegenOptions options;
   options.workgroupSize.assign(clWorkgroupSizes.begin(),
                                clWorkgroupSizes.end());
@@ -60,10 +49,7 @@
                                     clWorkgroupTileSizes.end());
   options.invocationTileSizes.assign(clInvocationTileSizes.begin(),
                                      clInvocationTileSizes.end());
-  options.enableVectorization =
-      clEnableLinalgOnTensorsSPIRV || clEnableVectorization;
   options.useWorkgroupMemory = clUseWorkgroupMemory;
-  options.usingLinalgOnTensors = clEnableLinalgOnTensorsSPIRV;
   return options;
 }
 

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h b/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h
index c19220f..40df44c 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h
+++ b/iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h

@@ -37,9 +37,7 @@
   llvm::SmallVector<unsigned, 3> workgroupTileSizes = {};
   llvm::SmallVector<unsigned, 3> invocationTileSizes = {};
 
-  bool enableVectorization = false;
   bool useWorkgroupMemory = false;
-  bool usingLinalgOnTensors = true;
 };
 
 // Returns SPIR-V CodeGen options from command-line options.

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/ConcretizeTileAmongWorkgroupsPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/ConcretizeTileAmongWorkgroupsPass.cpp
index ec5658e..2c5fbed 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/ConcretizeTileAmongWorkgroupsPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/ConcretizeTileAmongWorkgroupsPass.cpp

@@ -157,20 +157,6 @@
   linalgOps.assign(ops.begin(), ops.end());
   linalg::LinalgDependenceGraph dependenceGraph(aliases, linalgOps);
 
-  // NOTE: Launch configuration expects the original input/output type to decide
-  // the configuration. But we have already tiled the Linalg ops here. Use an
-  // attribute to send it over for now.
-  const char inputTypeAttrName[] = "iree.codegen.original_input_types";
-  const char outputTypeAttrName[] = "iree.codegen.original_output_types";
-  if (!inputTypes.empty()) {
-    rootOp->setAttr(inputTypeAttrName,
-                    Builder(rootOp).getTypeArrayAttr(inputTypes));
-  }
-  if (!outputTypes.empty()) {
-    rootOp->setAttr(outputTypeAttrName,
-                    Builder(rootOp).getTypeArrayAttr(outputTypes));
-  }
-
   Optional<LaunchConfig> launchConfig = initGPULaunchConfig(
       rootOp->getContext(), dependenceGraph, options, linalgOps);
   if (!launchConfig) {

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
index 4726e36..ef56ea1 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/ConvertToGPUPass.cpp

@@ -23,7 +23,6 @@
 
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/Common/Transforms.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h"
@@ -439,16 +438,10 @@
 }
 
 /// Distributes scf.parallel to workitems using local invocation ID.
-static LogicalResult mapToLocalInvocationId(
-    ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp,
-    bool useCyclicDistribution = false) {
-  if (useCyclicDistribution) {
-    return distributeCyclicallyToProcessors<gpu::ThreadIdOp, gpu::BlockDimOp>(
-        rewriter, pLoopOp);
-  }
-  return distributeSingleIterationPerProcessor<gpu::ThreadIdOp,
-                                               gpu::BlockDimOp>(rewriter,
-                                                                pLoopOp);
+static LogicalResult mapToLocalInvocationId(ConversionPatternRewriter &rewriter,
+                                            scf::ParallelOp pLoopOp) {
+  return distributeCyclicallyToProcessors<gpu::ThreadIdOp, gpu::BlockDimOp>(
+      rewriter, pLoopOp);
 }
 
 /// Distributes scf.parallel to workitems using global invocation ID. The GPU
@@ -499,22 +492,14 @@
 
 namespace {
 /// Pass to convert from tiled and fused linalg ops into gpu.func.
-class ConvertToGPUPass
+struct ConvertToGPUPass
     : public PassWrapper<ConvertToGPUPass,
                          OperationPass<IREE::HAL::ExecutableTargetOp>> {
- public:
-  ConvertToGPUPass(const SPIRVCodegenOptions &passOptions)
-      : options(passOptions) {}
-  ConvertToGPUPass(const ConvertToGPUPass &pass) : options(pass.options) {}
-
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<AffineDialect, gpu::GPUDialect, scf::SCFDialect,
                     ShapeDialect>();
   }
   void runOnOperation() override;
-
- private:
-  SPIRVCodegenOptions options;
 };
 
 struct SerializeParallelLoopPattern
@@ -532,7 +517,7 @@
 template <typename LinalgOpTy>
 static LogicalResult mapLinalgOpToLocalInvocationIdImpl(
     LinalgOpTy linalgOp, ArrayRef<Value> operands,
-    ConversionPatternRewriter &rewriter, bool optimizeControlFlow) {
+    ConversionPatternRewriter &rewriter) {
   // Check for marker that specifies that the linalg op is to be partitioned
   // across threads within a workgroup.
   if (!hasMarker(linalgOp)) return failure();
@@ -542,7 +527,7 @@
   if (loops.getValue().empty()) return success();
 
   auto pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]);
-  return mapToLocalInvocationId(rewriter, pLoopOp, optimizeControlFlow);
+  return mapToLocalInvocationId(rewriter, pLoopOp);
 }
 
 static LogicalResult distributeCopyOp(linalg::CopyOp copyOp,
@@ -580,7 +565,7 @@
 template <>
 LogicalResult mapLinalgOpToLocalInvocationIdImpl<linalg::CopyOp>(
     linalg::CopyOp copyOp, ArrayRef<Value> operands,
-    ConversionPatternRewriter &rewriter, bool optimizeControlFlow) {
+    ConversionPatternRewriter &rewriter) {
   if (!hasMarker(copyOp,
                  {getCopyToWorkgroupMemoryMarker(), getWorkgroupMarker()}))
     return failure();
@@ -591,7 +576,7 @@
 
   auto pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]);
   if (hasMarker(copyOp, getWorkgroupMarker())) {
-    return mapToLocalInvocationId(rewriter, pLoopOp, optimizeControlFlow);
+    return mapToLocalInvocationId(rewriter, pLoopOp);
   }
   return distributeCopyOp(copyOp, pLoopOp, rewriter);
 }
@@ -601,15 +586,14 @@
 template <typename LinalgOpTy>
 struct MapLinalgOpToLocalInvocationId : public OpConversionPattern<LinalgOpTy> {
   MapLinalgOpToLocalInvocationId(MLIRContext *context,
-                                 bool usingLinalgOnTensorsPath,
                                  PatternBenefit benefit = 1)
-      : OpConversionPattern<LinalgOpTy>(context, benefit),
-        usingLinalgOnTensorsPath(usingLinalgOnTensorsPath) {}
+      : OpConversionPattern<LinalgOpTy>(context, benefit) {}
+
   LogicalResult matchAndRewrite(
       LinalgOpTy linalgOp, ArrayRef<Value> operands,
       ConversionPatternRewriter &rewriter) const override {
-    if (failed(mapLinalgOpToLocalInvocationIdImpl(linalgOp, operands, rewriter,
-                                                  usingLinalgOnTensorsPath)))
+    if (failed(
+            mapLinalgOpToLocalInvocationIdImpl(linalgOp, operands, rewriter)))
       return failure();
 
     // If the `linalgOp` writes to workgroup memory insert barrier after the
@@ -626,13 +610,6 @@
     rewriter.eraseOp(linalgOp);
     return success();
   }
-
- private:
-  /// Flag to signify if Linalg on tensors path is being used. The control flow
-  /// optimizations implemented on legacy path seems to be failing on this
-  /// path. Assuming this overhead is not too much, for now just generated the
-  /// extra loops.
-  bool usingLinalgOnTensorsPath;
 };
 
 /// Given the workload return the workgroup count along X obtained by
@@ -655,10 +632,8 @@
 struct MapLinalgOpToGlobalInvocationId
     : public OpConversionPattern<LinalgOpTy> {
   MapLinalgOpToGlobalInvocationId(MLIRContext *context,
-                                  bool usingLinalgOnTensorsPath,
                                   PatternBenefit benefit = 1)
-      : OpConversionPattern<LinalgOpTy>(context, benefit),
-        usingLinalgOnTensorsPath(usingLinalgOnTensorsPath) {}
+      : OpConversionPattern<LinalgOpTy>(context, benefit) {}
 
   LogicalResult matchAndRewrite(
       LinalgOpTy linalgOp, ArrayRef<Value> operands,
@@ -685,46 +660,15 @@
         workgroupSize = {32, 1, 1};
       }
     }
-    if (usingLinalgOnTensorsPath) {
-      WorkgroupCountRegionBuilder regionBuilder =
-          [&workgroupSize](
-              OpBuilder &b, Location loc,
-              std::array<Value, 3> workload) -> std::array<Value, 3> {
-        Value one = b.create<ConstantIndexOp>(loc, 1);
-        return {getWorkgroupCountX(b, loc, workload, workgroupSize[0]), one,
-                one};
-      };
-      if (failed(defineWorkgroupCountRegion(rewriter, funcOp, regionBuilder))) {
-        return failure();
-      }
-    } else {
-      // TODO (GH-4901): Only support static shapes on this path. This should be
-      // removed when moved to linalg on tensors.
-      Optional<SmallVector<int64_t, 4>> staticLoopRange =
-          linalgOp.getStaticLoopRanges();
-      if (!staticLoopRange ||
-          llvm::any_of(staticLoopRange.getValue(), [](int64_t d) {
-            return d == ShapedType::kDynamicSize;
-          })) {
-        return linalgOp.emitError("failed to find statlc loop bounds");
-      }
-      ArrayRef<int64_t> parallelLoopRange(staticLoopRange.getValue());
-      unsigned numOuterParallel = getNumOuterParallelLoops(linalgOp);
-      parallelLoopRange = parallelLoopRange.take_front(numOuterParallel);
-      WorkgroupCountRegionBuilder regionBuilder =
-          [&parallelLoopRange, &workgroupSize](
-              OpBuilder &b, Location loc,
-              std::array<Value, 3> workload) -> std::array<Value, 3> {
-        Value one = b.create<ConstantIndexOp>(loc, 1);
-        auto values = llvm::to_vector<4>(
-            llvm::map_range(parallelLoopRange, [&](int64_t dim) -> Value {
-              return b.create<ConstantIndexOp>(loc, dim);
-            }));
-        return {getWorkgroupCountX(b, loc, values, workgroupSize[0]), one, one};
-      };
-      if (failed(defineWorkgroupCountRegion(rewriter, funcOp, regionBuilder))) {
-        return failure();
-      }
+    WorkgroupCountRegionBuilder regionBuilder =
+        [&workgroupSize](OpBuilder &b, Location loc,
+                         std::array<Value, 3> workload) {
+          Value one = b.create<ConstantIndexOp>(loc, 1);
+          return std::array<Value, 3>{
+              getWorkgroupCountX(b, loc, workload, workgroupSize[0]), one, one};
+        };
+    if (failed(defineWorkgroupCountRegion(rewriter, funcOp, regionBuilder))) {
+      return failure();
     }
     if (failed(updateWorkGroupSize(funcOp, workgroupSize))) {
       return failure();
@@ -732,13 +676,6 @@
     rewriter.eraseOp(linalgOp);
     return success();
   }
-
- private:
-  /// Flag to signify if Linalg on tensors path is being used. This changes the
-  /// way the number of workgroups is computed. With the linalg on tensors path,
-  /// the hal.executable.entry_point will be updated to contain a region that
-  /// gives the number of workgroups to use.
-  bool usingLinalgOnTensorsPath;
 };
 
 /// Remove the linalg.range operation created when lowering to loops.
@@ -843,8 +780,7 @@
       MapLinalgOpToLocalInvocationId<linalg::PoolingNHWCMaxFOp>,
       MapLinalgOpToLocalInvocationId<linalg::PoolingNHWCMinFOp>,
       MapLinalgOpToLocalInvocationId<linalg::PoolingNHWCSumFOp>,
-      RemoveLinalgRange, SerializeParallelLoopPattern>(
-      context, options.usingLinalgOnTensors);
+      RemoveLinalgRange, SerializeParallelLoopPattern>(context);
   FrozenRewritePatternSet frozenPatterns(std::move(patterns));
 
   for (FuncOp funcOp : getOperation().getInnerModule().getOps<FuncOp>()) {
@@ -860,15 +796,13 @@
 }
 
 std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createConvertToGPUPass(const SPIRVCodegenOptions &options) {
-  return std::make_unique<ConvertToGPUPass>(options);
+createConvertToGPUPass() {
+  return std::make_unique<ConvertToGPUPass>();
 }
 
 static PassRegistration<ConvertToGPUPass> pass(
-    "iree-codegen-convert-to-gpu", "Map tiled linalg and loop ops to GPU", [] {
-      SPIRVCodegenOptions options = getSPIRVCodegenOptionsFromClOptions();
-      return std::make_unique<ConvertToGPUPass>(options);
-    });
+    "iree-codegen-convert-to-gpu", "Map tiled linalg and loop ops to GPU",
+    [] { return std::make_unique<ConvertToGPUPass>(); });
 
 }  // namespace iree_compiler
 }  // namespace mlir

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/FoldGPUProcessorIDUses.cpp b/iree/compiler/Conversion/LinalgToSPIRV/FoldGPUProcessorIDUses.cpp
index 1e35b13..53de3b4 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/FoldGPUProcessorIDUses.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/FoldGPUProcessorIDUses.cpp

@@ -18,7 +18,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "llvm/ADT/STLExtras.h"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp b/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp
index 90b1619..c8e3ae5 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.cpp

@@ -24,7 +24,6 @@
 #include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h"
 
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/Common/LaunchConfig.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
@@ -203,8 +202,7 @@
                                 const SPIRVCodegenOptions &options,
                                 TileSizesListType &tileSizes,
                                 LaunchConfigInfo &config) {
-  if (options.enableVectorization &&
-      succeeded(getMaliSpecificConfig(op, targetEnv, options, tileSizes,
+  if (succeeded(getMaliSpecificConfig(op, targetEnv, options, tileSizes,
                                       config.workgroupSize,
                                       config.numSubgroups))) {
     config.vectorize = true;
@@ -337,10 +335,9 @@
   // transfer_read ops with permutation maps that we currently cannot lower.
   // TODO: Remove this restriction once the lowering of the permutation map is
   // supported in core.
-  bool vectorize = options.enableVectorization &&
-                   llvm::all_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
-                     return map.isMinorIdentity();
-                   });
+  bool vectorize = llvm::all_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
+    return map.isMinorIdentity();
+  });
   int64_t subgroupSize =
       targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
   config.workgroupSize[0] = subgroupSize;
@@ -455,15 +452,13 @@
                                 const SPIRVCodegenOptions &options,
                                 TileSizesListType &tileSizes,
                                 LaunchConfigInfo &config) {
-  if (options.enableVectorization &&
-      succeeded(getConfigForCooperativeMatmul(op, targetEnv, options, tileSizes,
+  if (succeeded(getConfigForCooperativeMatmul(op, targetEnv, options, tileSizes,
                                               config.workgroupSize,
                                               config.numSubgroups))) {
     config.vectorize = true;
     return success();
   }
-  if (options.enableVectorization &&
-      succeeded(getTargetSpecificConfig(op, targetEnv, options, tileSizes,
+  if (succeeded(getTargetSpecificConfig(op, targetEnv, options, tileSizes,
                                         config.workgroupSize,
                                         config.numSubgroups))) {
     config.vectorize = true;
@@ -575,8 +570,7 @@
                                     const SPIRVCodegenOptions &options,
                                     TileSizesListType &tileSizes,
                                     LaunchConfigInfo &config) {
-  if (options.enableVectorization &&
-      targetEnv.getVendorID() == spirv::Vendor::ARM &&
+  if (targetEnv.getVendorID() == spirv::Vendor::ARM &&
       succeeded(getMaliSpecificConfig(op, tileSizes, config))) {
     return success();
   }
@@ -587,11 +581,7 @@
   const int64_t tileSizeX = 32;
   int64_t tileSizeY = maxWorkgroupSize / tileSizeX;
   SmallVector<int64_t, 4> ts;
-  if (options.usingLinalgOnTensors) {
-    ts.assign({0, 1, tileSizeY, tileSizeX});
-  } else {
-    ts.assign({1, tileSizeY, tileSizeX});
-  }
+  ts.assign({0, 1, tileSizeY, tileSizeX});
   tileSizes.emplace_back(std::move(ts));
   config.workgroupSize = {tileSizeX, tileSizeY, 1};
   return success();
@@ -689,11 +679,7 @@
   const int64_t tileSizeX = 32;
   int64_t tileSizeY = maxWorkgroupSize / tileSizeX;
   SmallVector<int64_t, 4> ts;
-  if (options.usingLinalgOnTensors) {
-    ts.assign({0, 1, tileSizeY, tileSizeX});
-  } else {
-    ts.assign({1, tileSizeY, tileSizeX});
-  }
+  ts.assign({0, 1, tileSizeY, tileSizeX});
   tileSizes.emplace_back(std::move(ts));
   config.workgroupSize = {tileSizeX, tileSizeY, 1};
   return success();
@@ -711,12 +697,8 @@
   const int64_t tileSizeX = 32;
   int64_t tileSizeY = maxWorkgroupSize / tileSizeX;
   SmallVector<int64_t, 4> ts;
-  if (options.usingLinalgOnTensors) {
-    // There are five parallel loops in depthwise_conv_2d_input_nhwc_filter_hwcf
-    ts.assign({0, 0, 1, tileSizeY, tileSizeX});
-  } else {
-    ts.assign({1, tileSizeY, tileSizeX});
-  }
+  // There are five parallel loops in depthwise_conv_2d_input_nhwc_filter_hwcf
+  ts.assign({0, 0, 1, tileSizeY, tileSizeX});
   tileSizes.emplace_back(std::move(ts));
   config.workgroupSize = {tileSizeX, tileSizeY, 1};
   return success();
@@ -737,11 +719,7 @@
   const int64_t tileSizeX = 32;
   int64_t tileSizeY = maxWorkgroupSize / tileSizeX;
   SmallVector<int64_t, 4> ts;
-  if (options.usingLinalgOnTensors) {
-    ts.assign({0, tileSizeY, tileSizeX, 1});
-  } else {
-    ts.assign({0, tileSizeY, tileSizeX});
-  }
+  ts.assign({0, tileSizeY, tileSizeX, 1});
   tileSizes.emplace_back(std::move(ts));
   config.workgroupSize = {tileSizeX, tileSizeY, 1};
   return success();
@@ -782,7 +760,7 @@
       // Invocation level.
       launchConfig.setTileSizes(linalgOp.getOperation(), invocationTileSizes,
                                 2);
-      launchConfig.setVectorize(options.enableVectorization);
+      launchConfig.setVectorize(true);
     }
     SmallVector<int64_t, 3> workgroupSize(options.workgroupSize.begin(),
                                           options.workgroupSize.end());

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndDistributePass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndDistributePass.cpp
deleted file mode 100644
index ba1333b..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/LinalgTileAndDistributePass.cpp
+++ /dev/null

@@ -1,155 +0,0 @@
-// Copyright 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//===- LinalgTileAndDistributePass.cpp ------------------------------------===//
-//
-// This pass tiles and distributes linalg operations among multiple workgroups.
-//
-// NOTE: Deprecated. This pass is used for the first-level tiling in the Linalg
-// on buffers path, which is expected to go away soon.
-//
-//===----------------------------------------------------------------------===//
-
-#include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
-#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
-#include "iree/compiler/Conversion/Common/Transforms.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
-#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
-#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
-#include "mlir/Dialect/GPU/GPUDialect.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-linalg-to-spirv-tile-and-distribute"
-
-namespace mlir {
-namespace iree_compiler {
-namespace {
-
-/// Returns the distribution options for operations when targeting workgroups.
-linalg::LinalgLoopDistributionOptions getWorkgroupDistributionOptions() {
-  linalg::LinalgLoopDistributionOptions options;
-
-  options.procInfo = [](OpBuilder &builder, Location loc,
-                        ArrayRef<Range> parallelLoopRanges) {
-    return getGPUProcessorIdsAndCounts<gpu::BlockIdOp, gpu::GridDimOp>(
-        builder, loc, parallelLoopRanges.size());
-  };
-  options.distributionMethod.assign(
-      3, linalg::DistributionMethod::CyclicNumProcsEqNumIters);
-
-  return options;
-}
-
-class LinalgTileAndDistributePass
-    : public PassWrapper<LinalgTileAndDistributePass,
-                         OperationPass<IREE::HAL::ExecutableTargetOp>> {
- public:
-  LinalgTileAndDistributePass(const SPIRVCodegenOptions &options)
-      : options(options) {}
-  LinalgTileAndDistributePass(const LinalgTileAndDistributePass &that)
-      : options(that.options) {}
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AffineDialect, IREE::HAL::HALDialect, linalg::LinalgDialect,
-                    scf::SCFDialect>();
-  }
-
-  void runOnOperation() override {
-    MLIRContext *context = &getContext();
-    IREE::HAL::ExecutableTargetOp targetOp = getOperation();
-    ModuleOp module = targetOp.getInnerModule();
-
-    for (FuncOp funcOp : module.getOps<FuncOp>()) {
-      if (!isEntryPoint(funcOp)) continue;
-
-      SmallVector<linalg::LinalgOp, 4> linalgOps;
-      SmallVector<Operation *, 4> tiledLoops;
-
-      if (failed(getLinalgOps(funcOp, linalgOps, tiledLoops))) {
-        // If there are no linalg ops, nothing to do here.
-        continue;
-      }
-
-      linalg::Aliases aliases;
-      linalg::LinalgDependenceGraph dependenceGraph(aliases, linalgOps);
-      Optional<LaunchConfig> launchConfigOpt =
-          initGPULaunchConfig(context, dependenceGraph, options, linalgOps);
-      if (!launchConfigOpt) {
-        // Having no launch configuration also means nothing to do here.
-        continue;
-      }
-      LaunchConfig &launchConfig = *launchConfigOpt;
-
-      LLVM_DEBUG({
-        llvm::dbgs()
-            << "\n--- IREE Linalg tile and distribute configuration ---\n";
-        llvm::dbgs() << "@func " << funcOp.getName()
-                     << ": # workgroup sizes: [";
-        interleaveComma(launchConfig.getWorkgroupSize(), llvm::dbgs());
-        llvm::dbgs() << "]\n";
-        for (auto op : linalgOps) {
-          llvm::dbgs() << "\t" << op.getOperation()->getName() << " : ";
-          TileSizesListTypeRef tileSizes = launchConfig.getTileSizes(op);
-          llvm::dbgs() << "{";
-          std::string sep = "";
-          for (auto &level : enumerate(tileSizes)) {
-            llvm::dbgs() << sep << level.index() << " : [";
-            sep = ", ";
-            interleaveComma(level.value(), llvm::dbgs());
-            llvm::dbgs() << "]";
-          }
-          llvm::dbgs() << "}\n";
-        }
-      });
-      TileAndFuseOptions tileAndFuseOptions = {
-          getWorkgroupDistributionOptions(), allocateWorkgroupMemory};
-      if (failed(tileAndFuseLinalgBufferOps(funcOp, linalgOps, dependenceGraph,
-                                            launchConfig,
-                                            tileAndFuseOptions)) ||
-          failed(
-              updateWorkGroupSize(funcOp, launchConfig.getWorkgroupSize()))) {
-        return signalPassFailure();
-      }
-    }
-  }
-
- private:
-  SPIRVCodegenOptions options;
-};
-
-}  // namespace
-
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createTileAndDistributeAmongWorkgroupsPass(const SPIRVCodegenOptions &options) {
-  return std::make_unique<LinalgTileAndDistributePass>(options);
-}
-
-static PassRegistration<LinalgTileAndDistributePass> pass(
-    "iree-codegen-spirv-linalg-tile-and-distribute",
-    "Tile and distribute Linalg operations on buffers", [] {
-      SPIRVCodegenOptions options = getSPIRVCodegenOptionsFromClOptions();
-      return std::make_unique<LinalgTileAndDistributePass>(options);
-    });
-
-}  // namespace iree_compiler
-}  // namespace mlir

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/MatMulVectorizationTest.cpp b/iree/compiler/Conversion/LinalgToSPIRV/MatMulVectorizationTest.cpp
deleted file mode 100644
index 295a48e..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/MatMulVectorizationTest.cpp
+++ /dev/null

@@ -1,76 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-
-static llvm::cl::opt<int> wgTileSize(
-    "iree-codegen-linalg-to-gpu-wg-tile-size",
-    llvm::cl::desc(
-        "Specify the size of workgroup tile for matmul vector lowering"),
-    llvm::cl::init(32));
-
-static llvm::cl::list<uint32_t> unrollSize(
-    "iree-codegen-linalg-to-gpu-unroll-size",
-    llvm::cl::desc("Specify the size of the "), llvm::cl::CommaSeparated);
-
-static llvm::cl::opt<bool> enableLICM(
-    "iree-codegen-linalg-to-gpu-matmul-licm",
-    llvm::cl::desc(
-        "If true run LICM and hoisting passes after the staged transforms"),
-    llvm::cl::init(true));
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-struct MatMulTileAndVectorizeGPUPass
-    : PassWrapper<MatMulTileAndVectorizeGPUPass, FunctionPass> {
-  void runOnFunction() override;
-};
-}  // namespace
-
-void MatMulTileAndVectorizeGPUPass::runOnFunction() {
-  FuncOp fn = getFunction();
-  SmallVector<uint32_t, 3> vUnrollSize(unrollSize.begin(), unrollSize.end());
-  if (vUnrollSize.size() != 3) signalPassFailure();
-  linalg::CodegenStrategy strategy;
-  strategy
-      .tile<linalg::MatmulOp>(
-          linalg::LinalgTilingOptions()
-              // TODO(thomasraoux): Enable parallel loops once affine.min
-              // canonicalize supports it.
-              //.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
-              .setTileSizes({wgTileSize, wgTileSize, wgTileSize}))
-      .setEnableLICM(enableLICM)
-      .vectorize<linalg::MatmulOp>()
-      // TODO upstream to the core CodegenStrategy
-      // .unrollVector<vector::ContractionOp>(
-      //     {vUnrollSize[0], vUnrollSize[1], vUnrollSize[2]})
-      ;
-  strategy.transform(fn);
-}
-
-std::unique_ptr<FunctionPass> createMatMulTileAndVectorizeGPUPass() {
-  return std::make_unique<MatMulTileAndVectorizeGPUPass>();
-}
-
-static PassRegistration<MatMulTileAndVectorizeGPUPass> pass(
-    "iree-codegen-linalg-to-gpu-matmul-vectorization-pass",
-    "Tile and vectorize linalg.matmul operation",
-    [] { return std::make_unique<MatMulTileAndVectorizeGPUPass>(); });
-
-}  // namespace iree_compiler
-}  // namespace mlir

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/Passes.cpp b/iree/compiler/Conversion/LinalgToSPIRV/Passes.cpp
index 72aa4ae..ae2b6d6 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/Passes.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/Passes.cpp

@@ -83,14 +83,10 @@
   //     - The Linalg op is kept untouched.
   //
   //===--------------------------------------------------------------------===//
-  if (options.usingLinalgOnTensors) {
-    // flow.dispatch.workgroups performed abstract tiling and distribution. Make
-    // them concrete now since we know the target and settings now.
-    pm.addPass(createConcretizeTileAmongWorkgroupsPass(options));
-  } else {
-    pm.addPass(createSplitDispatchFunctionPass());
-    pm.addPass(createTileAndDistributeAmongWorkgroupsPass(options));
-  }
+
+  // flow.dispatch.workgroups performed abstract tiling and distribution. Make
+  // them concrete now that the target and settings are known.
+  pm.addPass(createConcretizeTileAmongWorkgroupsPass(options));
 
   pm.addPass(createTileAndVectorizeInOneWorkgroupPass(options));
   pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
@@ -103,10 +99,8 @@
   //     workgroups.
   //   - Linalg ops are converted to loop.for ops and mapped to workitems.
   //===--------------------------------------------------------------------===//
-  pm.addPass(createConvertToGPUPass(options));
-  if (options.enableVectorization) {
-    pm.nest<ModuleOp>().addNestedPass<FuncOp>(createVectorToGPUPass());
-  }
+  pm.addPass(createConvertToGPUPass());
+  pm.nest<ModuleOp>().addNestedPass<FuncOp>(createVectorToGPUPass());
   pm.nest<ModuleOp>().addPass(createLowerAffinePass());
   pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
   pm.nest<ModuleOp>().addPass(createCSEPass());
@@ -120,29 +114,22 @@
   //   - Load/store on std.subview ops are converted into load/store on the
   //     original buffers.
   //===--------------------------------------------------------------------===//
-  if (options.enableVectorization) {
-    pm.nest<ModuleOp>().addNestedPass<FuncOp>(
-        createVectorTransferOptimizationPass());
-  }
+  pm.nest<ModuleOp>().addNestedPass<FuncOp>(
+      createVectorTransferOptimizationPass());
   pm.nest<ModuleOp>().addPass(memref::createFoldSubViewOpsPass());
   pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
   pm.nest<ModuleOp>().addPass(createCSEPass());
-  if (options.enableVectorization) {
-    pm.nest<ModuleOp>().addPass(createVectorizeMemrefLoadStorePass());
-    pm.nest<ModuleOp>().addNestedPass<FuncOp>(
-        createConvertVectorToCooperativeMatrixPass());
-    pm.nest<ModuleOp>().addNestedPass<FuncOp>(
-        createForOpCanonicalizationPass());
-    pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
-    pm.nest<ModuleOp>().addPass(createCSEPass());
-  }
+  pm.nest<ModuleOp>().addPass(createVectorizeMemrefLoadStorePass());
+  pm.nest<ModuleOp>().addNestedPass<FuncOp>(
+      createConvertVectorToCooperativeMatrixPass());
+  pm.nest<ModuleOp>().addNestedPass<FuncOp>(createForOpCanonicalizationPass());
+  pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
+  pm.nest<ModuleOp>().addPass(createCSEPass());
 
-  if (options.usingLinalgOnTensors) {
-    pm.nest<ModuleOp>().addNestedPass<FuncOp>(createFlattenMemRefSubspanPass());
-    pm.nest<ModuleOp>().addPass(createLowerAffinePass());
-    pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
-    pm.nest<ModuleOp>().addPass(createCSEPass());
-  }
+  pm.nest<ModuleOp>().addNestedPass<FuncOp>(createFlattenMemRefSubspanPass());
+  pm.nest<ModuleOp>().addPass(createLowerAffinePass());
+  pm.nest<ModuleOp>().addPass(createCanonicalizerPass());
+  pm.nest<ModuleOp>().addPass(createCSEPass());
 
   //===--------------------------------------------------------------------===//
   // Final conversion to SPIR-V dialect.

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/Passes.h b/iree/compiler/Conversion/LinalgToSPIRV/Passes.h
index 7214ed8..82e5fca 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/Passes.h
+++ b/iree/compiler/Conversion/LinalgToSPIRV/Passes.h

@@ -37,7 +37,7 @@
 /// Pass to add the synchronizations and attributes needed to lower from PLoops
 /// to GPU dialect.
 std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createConvertToGPUPass(const SPIRVCodegenOptions &options);
+createConvertToGPUPass();
 
 /// Pass to perform the final conversion to SPIR-V dialect.
 /// This pass converts remaining interface ops into SPIR-V global variables,
@@ -45,16 +45,6 @@
 /// corresponding SPIR-V ops.
 std::unique_ptr<OperationPass<ModuleOp>> createConvertToSPIRVPass();
 
-/// Pass to split computation workload to multiple sequential dispatch
-/// functions. This pass operates on Linalg ops and prepares for lowering to
-/// GPU, where we need to tile the workload to workgroups and workitems. If the
-/// workload involves computation A and B, where B is dependent on A and A needs
-/// all workgroups to complete, then we need to split A and B into different
-/// kernels because there is no mechanism to perform cross-workgroup
-/// synchronization within a single kernel.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createSplitDispatchFunctionPass();
-
 /// Pass to convert vector operations to GPU level operations. Instructions of
 /// vector size equal to subgroup size are distributed across the subgroup.
 std::unique_ptr<OperationPass<FuncOp>> createVectorToGPUPass();
@@ -63,9 +53,6 @@
 /// cooperative matrix ops when possible.
 std::unique_ptr<FunctionPass> createConvertVectorToCooperativeMatrixPass();
 
-/// Pass to apply tiling and vectorization transformations on linagl::MatMulOp.
-std::unique_ptr<FunctionPass> createMatMulTileAndVectorizeGPUPass();
-
 /// Converts memref of scalar to memref of vector of efficent size. This will
 /// allow to convert memory accesses to vector load/store in SPIR-V without
 /// having pointer bitcast.
@@ -86,11 +73,6 @@
 std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
 createConcretizeTileAmongWorkgroupsPass(const SPIRVCodegenOptions &options);
 
-/// Tiles and distributes Linalg operations on buffers among multiple
-/// workgroups.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createTileAndDistributeAmongWorkgroupsPass(const SPIRVCodegenOptions &options);
-
 //===----------------------------------------------------------------------===//
 // Pipelines
 //===----------------------------------------------------------------------===//

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp
deleted file mode 100644
index e612d67..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/SplitDispatchFunctionPass.cpp
+++ /dev/null

@@ -1,330 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//===- SplitDispathFunctionPass.cpp ---------------------------------------===//
-//
-// This file implements a pass to split computation workload to multiple
-// sequential dispatch functions. This pass operates on Linalg ops and
-// scf.parallel op and prepares for lowering to GPU, where we need to tile the
-// workload to workgroups and workitems. If the workload involves computation A
-// and B, where B is dependent on A and A needs all workgroups to complete, then
-// we need to split A and B into different kernels because there is no mechanism
-// to perform cross-workgroup synchronization within a single kernel.
-//
-//===----------------------------------------------------------------------===//
-
-#include <iterator>
-
-#include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h"
-#include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
-#include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
-#include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
-#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/BlockAndValueMapping.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/RegionUtils.h"
-
-#define DEBUG_TYPE "split-dispatch-function"
-
-namespace mlir {
-namespace iree_compiler {
-
-//===----------------------------------------------------------------------===//
-// Utility functions
-//===----------------------------------------------------------------------===//
-
-/// Returns true if an op can be fused with the list of ops that are to be put
-/// in the same entry point function. This should be consistent with whatthe
-/// downstream passes can handle.
-static bool isFusableWithCurrentOpsList(
-    Operation *nextOp, ArrayRef<Operation *> currOpsList,
-    const linalg::LinalgDependenceGraph &dependenceGraph) {
-  if (currOpsList.empty()) return true;
-
-  linalg::LinalgOp dstOp = dyn_cast<linalg::LinalgOp>(nextOp);
-  linalg::LinalgOp srcOp = dyn_cast<linalg::LinalgOp>(currOpsList.back());
-  if (dstOp && srcOp) {
-    // TODO(#2963): This splits independent linalg opreations into its own
-    // dispatch, but in reality if the iteration domain of the ops are the same,
-    // and they have all iterator types parallel, they could be put in the same
-    // dispatch region.
-    if (!dependenceGraph.hasDependenceFrom(srcOp, dstOp)) return false;
-
-#define ADD_FUSABLE_PAIR(SrcOpTy, DstOpTy, DependenceTy)             \
-  if (isa<SrcOpTy>(srcOp.getOperation()) &&                          \
-      isa<DstOpTy>(dstOp.getOperation()) &&                          \
-      dependenceGraph.hasDependenceFrom(srcOp, dstOp, DependenceTy)) \
-    return true;
-
-    ADD_FUSABLE_PAIR(linalg::BatchMatmulOp, linalg::GenericOp,
-                     linalg::LinalgDependenceGraph::DependenceType::RAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::BatchMatmulOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::ConvInputNWCFilterWCFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::ConvInputNHWCFilterHWCFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::ConvInputNDHWCFilterDHWCFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::DepthwiseConvInputNHWCFilterHWCOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::DepthwiseConvInputNHWCFilterHWCFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::MatmulOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::BatchMatmulOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::PoolingNHWCMaxFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::PoolingNHWCMinFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::FillOp, linalg::PoolingNHWCSumFOp,
-                     linalg::LinalgDependenceGraph::DependenceType::WAW)
-    ADD_FUSABLE_PAIR(linalg::MatmulOp, linalg::GenericOp,
-                     linalg::LinalgDependenceGraph::DependenceType::RAW)
-
-#undef ADD_FUSABLE_PAIR
-  }
-  return false;
-}
-
-/// For the list of operations in `ops` returns a list of lists where each list
-/// contains the operations that need to be put in a separate dispatch function.
-static LogicalResult separateOps(
-    ArrayRef<Operation *> ops,
-    const linalg::LinalgDependenceGraph &dependenceGraph,
-    SmallVectorImpl<SmallVector<Operation *, 1>> &fusedOpList) {
-  assert(!ops.empty() &&
-         "expected at least one separable op for splitting dispatch function");
-  SmallVector<Operation *, 1> currList;
-  for (auto currOp = ops.begin(), nextOp = std::next(ops.begin());
-       nextOp != ops.end(); ++currOp, ++nextOp) {
-    // Check that the operation has buffer semantics.
-    if (auto linalgOp = dyn_cast<linalg::LinalgOp>(*currOp)) {
-      if (!linalgOp.hasBufferSemantics()) return failure();
-    }
-
-    // Require no other non-metadata ops interleave with Linalg structured ops
-    // for now. This is the common case and it simplifies further analysis.
-    Operation *iter = (*currOp)->getNextNode();
-    while (iter != *nextOp && (MemoryEffectOpInterface::hasNoEffect(iter) ||
-                               isa<IREE::PlaceholderOp>(iter)))
-      iter = iter->getNextNode();
-    if (iter != *nextOp) return failure();
-
-    currList.push_back(*currOp);
-
-    // If the nextOp is not fusible with the currOp, then record the list of ops
-    // so far, and start a new list.
-    if (isFusableWithCurrentOpsList(*nextOp, currList, dependenceGraph)) {
-      continue;
-    }
-
-    // Push the current list of ops into the list of lists `currList` and
-    // start a new list.
-    fusedOpList.emplace_back();
-    std::swap(fusedOpList.back(), currList);
-  }
-  currList.push_back(ops.back());
-  fusedOpList.emplace_back(std::move(currList));
-  return success();
-}
-
-/// Recursively collects all the operations that are referenced by given
-/// `rootOp` into `closure`.
-static void collectAllReferencedOps(
-    ArrayRef<Operation *> rootOps,
-    llvm::SmallPtrSetImpl<Operation *> &closure) {
-  llvm::SmallVector<Operation *, 8> workList;
-  workList.assign(rootOps.begin(), rootOps.end());
-
-  while (!workList.empty()) {
-    Operation *curOp = workList.pop_back_val();
-    if (!curOp) continue;
-    if (!closure.insert(curOp).second) continue;  // Seen before
-    // Collect all defining ops for operands.
-    for (Value operand : curOp->getOperands()) {
-      if (Operation *owner = operand.getDefiningOp()) workList.push_back(owner);
-    }
-    // Collect all defining ops for the values used in regions.
-    for (Region &region : curOp->getRegions()) {
-      visitUsedValuesDefinedAbove(region, [&workList](OpOperand *operand) {
-        workList.push_back(operand->get().getDefiningOp());
-      });
-    }
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Pass and patterns
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-struct SplitDispatchFunctionPass
-    : public PassWrapper<SplitDispatchFunctionPass,
-                         OperationPass<IREE::HAL::ExecutableTargetOp>> {
-  void runOnOperation() override;
-  LogicalResult splitDispatchFunction(FuncOp oldFn, OpBuilder &builder);
-};
-
-}  // namespace
-
-void SplitDispatchFunctionPass::runOnOperation() {
-  IREE::HAL::ExecutableTargetOp targetOp = getOperation();
-  ModuleOp moduleOp = targetOp.getInnerModule();
-
-  // Collect all dispatch entry functions.
-  SmallVector<FuncOp, 1> functions;
-  for (FuncOp fn : moduleOp.getOps<FuncOp>()) {
-    if (isEntryPoint(fn)) functions.push_back(fn);
-  }
-  if (functions.empty()) return;
-  if (functions.size() > 1) {
-    moduleOp.emitError("expected only one entry function");
-    return signalPassFailure();
-  }
-
-  auto builder = OpBuilder::atBlockBegin(moduleOp.getBody());
-  if (failed(splitDispatchFunction(functions.front(), builder))) {
-    return signalPassFailure();
-  }
-}
-
-LogicalResult SplitDispatchFunctionPass::splitDispatchFunction(
-    FuncOp oldFn, OpBuilder &builder) {
-  // Entry functions are supported to be of `void(void)` type.
-  assert(oldFn.getType().getNumInputs() == 0 &&
-         oldFn.getType().getNumResults() == 0);
-
-  if (!llvm::hasSingleElement(oldFn.getBlocks())) {
-    return oldFn.emitError("expected only one block");
-  }
-  IREE::HAL::ExecutableEntryPointOp oldEntryPointOp = getEntryPoint(oldFn);
-  if (!oldEntryPointOp) {
-    return oldFn.emitError("unable to find iree.executable.entry_point for ")
-           << oldFn.getName();
-  }
-  // The dispatch function should have more than one separable ops. Otherwise
-  // there is nothing to do.
-  Block &fnBody = oldFn.getBlocks().front();
-
-  // Collect all Linalg and scf.parallel ops for splitting.
-  SmallVector<Operation *, 4> separableOps;
-  for (Operation &op : fnBody)
-    if (isa<linalg::LinalgOp, scf::ParallelOp, scf::ForOp>(op))
-      separableOps.push_back(&op);
-
-  if (separableOps.size() <= 1) return success();
-
-  linalg::Aliases aliases;
-  linalg::LinalgDependenceGraph dependenceGraph =
-      linalg::LinalgDependenceGraph::buildDependenceGraph(aliases, oldFn);
-  SmallVector<SmallVector<Operation *, 1>, 1> fusedOpsList;
-  if (failed(separateOps(separableOps, dependenceGraph, fusedOpsList))) {
-    return oldFn.emitError(
-        "cannot separate Linalg/Parallel ops into multiple kernels");
-  }
-  if (fusedOpsList.size() <= 1) return success();
-
-  ModuleOp moduleOp = cast<ModuleOp>(oldFn->getParentOp());
-  Block &oldFnBlock = oldFn.getBlocks().front();
-  Location loc = oldFn.getLoc();
-  SmallVector<Attribute, 4> entryPoints;
-
-  for (const auto &fusedOps : llvm::enumerate(fusedOpsList)) {
-    if (fusedOps.value().empty()) continue;
-    // Create a new function for hosting this op.
-    std::string newFnName =
-        llvm::formatv("{0}_dispatch_{1}", oldFn.getName(), fusedOps.index());
-    builder.setInsertionPointToStart(moduleOp.getBody());
-    auto newFn = builder.create<FuncOp>(loc, newFnName, oldFn.getType());
-    LLVM_DEBUG({
-      llvm::dbgs() << "Created new function : func @" << newFn.getName()
-                   << "\n";
-    });
-
-    // Copy over all attributes except type and name.
-    for (const auto &namedAttr : oldFn->getAttrs()) {
-      if (namedAttr.first != impl::getTypeAttrName() &&
-          namedAttr.first != SymbolTable::getSymbolAttrName())
-        newFn->setAttr(namedAttr.first, namedAttr.second);
-    }
-
-    // Add the entry point operations for the new fn.
-    {
-      OpBuilder::InsertionGuard g(builder);
-      builder.setInsertionPoint(oldEntryPointOp);
-      auto clonedEntryPointOp = cast<IREE::HAL::ExecutableEntryPointOp>(
-          builder.clone(*oldEntryPointOp.getOperation()));
-      clonedEntryPointOp.sym_nameAttr(builder.getStringAttr(newFnName));
-      clonedEntryPointOp.ordinalAttr(
-          builder.getIndexAttr(static_cast<int32_t>(entryPoints.size())));
-      entryPoints.push_back(builder.getSymbolRefAttr(clonedEntryPointOp));
-    }
-
-    // Collect the closure for the current Linalg op.
-    llvm::SmallPtrSet<Operation *, 16> closure;
-    collectAllReferencedOps(fusedOps.value(), closure);
-
-    // Clone all ops in the closure to the new function.
-    Block *newFnBlock = newFn.addEntryBlock();
-    builder.setInsertionPointToStart(newFnBlock);
-    BlockAndValueMapping remapper;
-    for (Operation &op : oldFnBlock) {
-      if (closure.count(&op) == 0) continue;
-      builder.insert(op.clone(remapper));
-      if (&op == fusedOps.value().back()) break;
-    }
-    builder.insert(oldFnBlock.getTerminator()->clone(remapper));
-  }
-  moduleOp->setAttr(getEntryPointScheduleAttrName(),
-                    builder.getArrayAttr(entryPoints));
-
-  LLVM_DEBUG({ llvm::dbgs() << "Erased func @" << oldFn.getName() << "\n"; });
-  oldFn.erase();
-  oldEntryPointOp.erase();
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
-// Pass entry point and registration
-//===----------------------------------------------------------------------===//
-
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>>
-createSplitDispatchFunctionPass() {
-  return std::make_unique<SplitDispatchFunctionPass>();
-}
-
-static PassRegistration<SplitDispatchFunctionPass> pass(
-    "iree-codegen-split-dispatch-function",
-    "Split workload to multiple dispatch functions to satisfy computation "
-    "dependency for GPU lowering");
-
-}  // namespace iree_compiler
-}  // namespace mlir

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/TileAndVectorizeInOneWorkgroupPass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/TileAndVectorizeInOneWorkgroupPass.cpp
index 6e7a712..8fc4463 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/TileAndVectorizeInOneWorkgroupPass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/TileAndVectorizeInOneWorkgroupPass.cpp

@@ -22,7 +22,6 @@
 #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
 #include "iree/compiler/Conversion/CodegenUtils/TransformUtils.h"
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/Common/Transforms.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/VectorizeMemrefLoadStorePass.cpp b/iree/compiler/Conversion/LinalgToSPIRV/VectorizeMemrefLoadStorePass.cpp
index 00ae4c9..499c38e 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/VectorizeMemrefLoadStorePass.cpp
+++ b/iree/compiler/Conversion/LinalgToSPIRV/VectorizeMemrefLoadStorePass.cpp

@@ -418,8 +418,9 @@
   }
 };
 
-class VectorizeMemRefPass final
-    : public PassWrapper<VectorizeMemRefPass, OperationPass<ModuleOp>> {
+class VectorizeMemRefLoadStorePass final
+    : public PassWrapper<VectorizeMemRefLoadStorePass,
+                         OperationPass<ModuleOp>> {
   void runOnOperation() override;
 
  private:
@@ -455,7 +456,7 @@
   return success();
 }
 
-void VectorizeMemRefPass::runOnOperation() {
+void VectorizeMemRefLoadStorePass::runOnOperation() {
   // Uses the signature conversion methodology of the dialect conversion
   // framework to implement the conversion.
   ModuleOp module = getOperation();
@@ -504,10 +505,10 @@
 }
 
 std::unique_ptr<OperationPass<ModuleOp>> createVectorizeMemrefLoadStorePass() {
-  return std::make_unique<VectorizeMemRefPass>();
+  return std::make_unique<VectorizeMemRefLoadStorePass>();
 }
 
-static PassRegistration<VectorizeMemRefPass> pass(
+static PassRegistration<VectorizeMemRefLoadStorePass> pass(
     "iree-spirv-vectorize-memref-load-store",
     "Vectorize interface memrefs and their load/store for better memory "
     "access");

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/BUILD b/iree/compiler/Conversion/LinalgToSPIRV/test/BUILD
index 2ad8790..090dab0 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/BUILD
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/BUILD

@@ -36,11 +36,9 @@
             "forop_canonicalization.mlir",
             "materialize_launch_configuration.mlir",
             "materialize_launch_configuration2.mlir",
-            "matmul_vectorization_licm.mlir",
             "pipeline_matmul_cooperative_matrix.mlir",
             "pipeline_matmul_vectorization.mlir",
             "promote_workgroup_memory.mlir",
-            "split_dispatch_function.mlir",
             "tile_and_vectorize_batch_matmul.mlir",
             "tile_and_vectorize_conv.mlir",
             "tile_and_vectorize_matmul.mlir",

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/CMakeLists.txt b/iree/compiler/Conversion/LinalgToSPIRV/test/CMakeLists.txt
index 2d60bba..b53451a 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/CMakeLists.txt

@@ -23,11 +23,9 @@
     "forop_canonicalization.mlir"
     "materialize_launch_configuration.mlir"
     "materialize_launch_configuration2.mlir"
-    "matmul_vectorization_licm.mlir"
     "pipeline_matmul_cooperative_matrix.mlir"
     "pipeline_matmul_vectorization.mlir"
     "promote_workgroup_memory.mlir"
-    "split_dispatch_function.mlir"
     "tile_and_vectorize_batch_matmul.mlir"
     "tile_and_vectorize_conv.mlir"
     "tile_and_vectorize_matmul.mlir"

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration.mlir
index ba4a9e7..6cf8c8f 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.target(iree-spirv-concretize-tile-among-workgroups))" -iree-codegen-spirv-experimental-linalg-on-tensors -cse -canonicalize -split-input-file %s | IreeFileCheck %s
+// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.target(iree-spirv-concretize-tile-among-workgroups))" -canonicalize -cse -split-input-file %s | IreeFileCheck %s
 
 hal.executable @matmul_tensors attributes {sym_visibility = "private"} {
   hal.interface @io {

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration2.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration2.mlir
index 30a1b85..81b3b9d 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration2.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/materialize_launch_configuration2.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.target(iree-codegen-convert-to-gpu))" -iree-codegen-spirv-experimental-linalg-on-tensors -cse -canonicalize -split-input-file %s | IreeFileCheck %s
+// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.target(iree-codegen-convert-to-gpu))" -canonicalize -cse -split-input-file %s | IreeFileCheck %s
 
 hal.executable @add attributes {sym_visibility = "private"} {
   hal.interface @io {

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/matmul_vectorization_licm.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/matmul_vectorization_licm.mlir
deleted file mode 100644
index 223fe2b..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/matmul_vectorization_licm.mlir
+++ /dev/null

@@ -1,21 +0,0 @@
-// RUN: iree-opt --iree-codegen-linalg-to-gpu-matmul-vectorization-pass
-// RUN: -split-input-file %s --iree-codegen-linalg-to-gpu-unroll-size=8,8,32 \
-// RUN: -iree-codegen-linalg-to-gpu-matmul-licm | IreeFileCheck %s
-
-// CHECK-LABEL: func @matmul_128x128x128
-// CHECK-SAME: (%[[ARG0:.+]]: memref<128x128xf32>, %[[ARG1:.+]]: memref<128x128xf32>, %[[ARG2:.+]]: memref<128x128xf32>)
-func @matmul_128x128x128(%arg0 : memref<128x128xf32>, %arg1: memref<128x128xf32>, %arg2: memref<128x128xf32>) {
-    linalg.matmul ins(%arg0, %arg1 : memref<128x128xf32>, memref<128x128xf32>) outs(%arg2 : memref<128x128xf32>)
-    return
-}
-
-// CHECK-DAG: %[[TILESIZE:.+]] = constant 32 : index
-// CHECK-DAG: %[[MATSIZE:.+]] = constant 128 : index
-// CHECK-DAG: %[[START:.+]] = constant 0 : index
-//     CHECK: scf.for %[[IL:.+]] = %[[START]] to %[[MATSIZE]] step %[[TILESIZE]]
-//     CHECK:   scf.for %[[JL:.+]] = %[[START]] to %[[MATSIZE]] step %[[TILESIZE]]
-//     CHECK:     %[[SUBVVIEWC:.+]] = memref.subview %[[ARG2]][%[[IL]], %[[JL]]] [32, 32] [1, 1] : memref<128x128xf32> to memref<32x32xf32
-//     CHECK:     scf.for %[[KL:.+]] = %[[START]] to %[[MATSIZE]] step %[[TILESIZE]]
-//     CHECK:       %[[SUBVVIEWA:.+]] = memref.subview %[[ARG0]][%[[IL]], %[[KL]]] [32, 32] [1, 1] : memref<128x128xf32> to memref<32x32xf32
-//     CHECK:       %[[SUBVVIEWB:.+]] = memref.subview %[[ARG1]][%[[KL]], %[[JL]]] [32, 32] [1, 1] : memref<128x128xf32> to memref<32x32xf32
-

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir
deleted file mode 100644
index 977d0c3..0000000
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/split_dispatch_function.mlir
+++ /dev/null

@@ -1,692 +0,0 @@
-// RUN: iree-opt -allow-unregistered-dialect -split-input-file -pass-pipeline='hal.executable(hal.executable.target(iree-codegen-split-dispatch-function))' -verify-diagnostics %s | IreeFileCheck %s
-
-hal.executable @kernel_fusable_fill_conv1d_ops attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_fill_conv1d_ops attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x512xf32>, !flow.dispatch.tensor<readonly:3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x512xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_fill_conv1d_ops
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.conv_1d_input_nwc_filter_wcf
-      //     CHECK:   return
-
-      func @kernel_fusable_fill_conv1d_ops() {
-        %cst = constant 0.000000e+00 : f32
-        %dim = hal.interface.load.constant offset = 0 : index
-        %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,3,512]>
-        %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,512]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x3x512xf32>, !shapex.ranked_shape<[?,3,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x512xf32>
-        %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x512xf32>, !shapex.ranked_shape<[?,1,512]>
-        linalg.fill(%ts2, %cst) : memref<?x1x512xf32>, f32
-        linalg.conv_1d_input_nwc_filter_wcf {
-          dilations = dense<1> : tensor<1xi64>,
-          strides = dense<2> : tensor<1xi64>}
-           ins(%ts1, %1 : memref<?x3x512xf32>, memref<3x512x1xf32>)
-          outs(%ts2 : memref<?x1x512xf32>)
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-// -----
-
-hal.executable @kernel_fusable_fill_conv2d_ops attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_fill_conv2d_ops attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x3x512xf32>, !flow.dispatch.tensor<readonly:3x3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x1x512xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_fill_conv2d_ops
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.conv_2d_input_nhwc_filter_hwcf
-      //     CHECK:   return
-
-      func @kernel_fusable_fill_conv2d_ops() {
-        %cst = constant 0.000000e+00 : f32
-        %dim = hal.interface.load.constant offset = 0 : index
-        %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,3,3,512]>
-        %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,1,512]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x3x3x512xf32>, !shapex.ranked_shape<[?,3,3,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-        %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x1x512xf32>, !shapex.ranked_shape<[?,1,1,512]>
-        linalg.fill(%ts2, %cst) : memref<?x1x1x512xf32>, f32
-        linalg.conv_2d_input_nhwc_filter_hwcf {
-          dilations = dense<1> : tensor<2xi64>,
-          strides = dense<2> : tensor<2xi64>}
-           ins(%ts1, %1 : memref<?x3x3x512xf32>, memref<3x3x512x1xf32>)
-          outs(%ts2 : memref<?x1x1x512xf32>)
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-hal.executable @kernel_fusable_fill_conv3d_ops attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_fill_conv3d_ops attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x3x3x512xf32>, !flow.dispatch.tensor<readonly:3x3x3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x1x1x512xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_fill_conv3d_ops
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.conv_3d_input_ndhwc_filter_dhwcf
-      //     CHECK:   return
-
-      func @kernel_fusable_fill_conv3d_ops() {
-        %cst = constant 0.000000e+00 : f32
-        %dim = hal.interface.load.constant offset = 0 : index
-        %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,3,3,3,512]>
-        %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,1,1,512]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x3x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x3x3x3x512xf32>, !shapex.ranked_shape<[?,3,3,3,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x1x512xf32>
-        %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x1x1x512xf32>, !shapex.ranked_shape<[?,1,1,1,512]>
-        linalg.fill(%ts2, %cst) : memref<?x1x1x1x512xf32>, f32
-        linalg.conv_3d_input_ndhwc_filter_dhwcf {
-          dilations = dense<1> : tensor<3xi64>,
-          strides = dense<2> : tensor<3xi64>}
-           ins(%ts1, %1 : memref<?x3x3x3x512xf32>, memref<3x3x3x512x1xf32>)
-          outs(%ts2 : memref<?x1x1x1x512xf32>)
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-hal.executable @kernel_fusable_fill_matmul_ops attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_fill_matmul_ops attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x512xf32>, !flow.dispatch.tensor<readonly:512x?xf32>,
-        !flow.dispatch.tensor<writeonly:?x?xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_fill_matmul_ops
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.matmul
-      //     CHECK:   return
-
-      func @kernel_fusable_fill_matmul_ops() {
-        %cst = constant 0.000000e+00 : f32
-        %dimM = hal.interface.load.constant offset = 0 : index
-        %dimN = hal.interface.load.constant offset = 1 : index
-        %shape1 = shapex.make_ranked_shape %dimM : (index) -> !shapex.ranked_shape<[?,512]>
-        %shape2 = shapex.make_ranked_shape %dimN : (index) -> !shapex.ranked_shape<[512,?]>
-        %shape3 = shapex.make_ranked_shape %dimM, %dimN : (index, index) -> !shapex.ranked_shape<[?,?]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x512xf32>, !shapex.ranked_shape<[?,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<512x?xf32>
-        %ts2 = shapex.tie_shape %1, %shape2 : memref<512x?xf32>, !shapex.ranked_shape<[512, ?]>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x?xf32>
-        %ts3 = shapex.tie_shape %2, %shape3 : memref<?x?xf32>, !shapex.ranked_shape<[?,?]>
-        linalg.fill(%ts3, %cst) : memref<?x?xf32>, f32
-        linalg.matmul ins(%ts1, %ts2 : memref<?x512xf32>, memref<512x?xf32>)
-                      outs(%ts3 : memref<?x?xf32>)
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-hal.executable @kernel_fusable_pooling attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_pooling attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x?xf32>, !flow.dispatch.tensor<readonly:?x?x?x?xf32>,
-        !flow.dispatch.tensor<writeonly:?x?x?x?xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_pooling()
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.pooling_nhwc_sum
-      //     CHECK:   return
-      func @kernel_fusable_pooling() {
-        %cst = constant 0.000000e+00 : f32
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x?xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<?x?x?x?xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x?x?x?xf32>
-        linalg.fill(%2, %cst) : memref<?x?x?x?xf32>, f32
-        linalg.pooling_nhwc_sum
-          {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
-          ins(%1, %0: memref<?x?x?x?xf32>, memref<?x?xf32>)
-          outs(%2: memref<?x?x?x?xf32>)
-        return
-      }
-      hal.interface @io attributes {sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-hal.executable @kernel attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x3x512xf32>, !flow.dispatch.tensor<readonly:3x3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x1x512xf32>) -> ()}
-    // CHECK: hal.executable.entry_point @kernel_dispatch_0
-    // CHECK: hal.executable.entry_point @kernel_dispatch_1
-    // CHECK: module attributes {hal.entry_point_schedule = [@kernel_dispatch_0, @kernel_dispatch_1]}
-    module {
-      // CHECK: func @kernel_dispatch_1()
-      // CHECK:   %[[ZERO:.+]] = constant
-      // CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-      // CHECK:   %[[SHAPE:.+]] = shapex.make_ranked_shape %[[DIM]]
-      // CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-      // CHECK:   %[[TS:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE]]
-      // CHECK:   linalg.fill(%[[TS]], %[[ZERO]])
-      // CHECK:   return
-
-      // CHECK: func @kernel_dispatch_0()
-      // CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-      // CHECK:   %[[SHAPE1:.+]] = shapex.make_ranked_shape %[[DIM]]
-      // CHECK:   %[[SHAPE2:.+]] = shapex.make_ranked_shape %[[DIM]]
-      // CHECK:   %[[IN1:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x512xf32>
-      // CHECK:   %[[TS1:.+]] = shapex.tie_shape %[[IN1]], %[[SHAPE1]]
-      // CHECK:   %[[IN2:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-      // CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-      // CHECK:   %[[TS2:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE2]]
-      // CHECK:   linalg.conv_2d_input_nhwc_filter_hwcf
-      // CHECK-SAME: ins(%[[TS1]], %[[IN2]] : memref<?x3x3x512xf32>, memref<3x3x512x1xf32>)
-      // CHECK-SAME: outs(%[[TS2]] : memref<?x1x1x512xf32>)
-      // CHECK:   return
-
-      func @kernel() {
-        %cst = constant 0.000000e+00 : f32
-        %dim = hal.interface.load.constant offset = 0 : index
-        %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,3,3,512]>
-        %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,1,512]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x3x3x512xf32>, !shapex.ranked_shape<[?,3,3,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-        %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x1x512xf32>, !shapex.ranked_shape<[?,1,1,512]>
-        linalg.conv_2d_input_nhwc_filter_hwcf {
-          dilations = dense<1> : tensor<2xi64>,
-          strides = dense<2> : tensor<2xi64>}
-           ins(%ts1, %1 : memref<?x3x3x512xf32>, memref<3x3x512x1xf32>)
-          outs(%ts2 : memref<?x1x1x512xf32>)
-        linalg.fill(%ts2, %cst) : memref<?x1x1x512xf32>, f32
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-hal.executable @kernel attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x3x512xf32>, !flow.dispatch.tensor<readonly:3x3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x1x512xf32>) -> ()}
-    // CHECK: hal.executable.entry_point @kernel_dispatch_0
-    // CHECK: hal.executable.entry_point @kernel_dispatch_1
-    // CHECK: hal.executable.entry_point @kernel_dispatch_2
-    // CHECK: module attributes {hal.entry_point_schedule = [@kernel_dispatch_0, @kernel_dispatch_1, @kernel_dispatch_2]}
-    module {
-    //      CHECK: func @kernel_dispatch_2()
-    //      CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-    //      CHECK:   %[[SHAPE1:.+]] = shapex.make_ranked_shape %[[DIM]]
-    //      CHECK:   %[[SHAPE2:.+]] = shapex.make_ranked_shape %[[DIM]]
-    //      CHECK:   %[[IN1:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x512xf32>
-    //      CHECK:   %[[TS1:.+]] = shapex.tie_shape %[[IN1]], %[[SHAPE1]]
-    //      CHECK:   %[[IN2:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-    //      CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-    //      CHECK:   %[[TS2:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE2]]
-    //      CHECK:   linalg.conv_2d_input_nhwc_filter_hwcf
-    // CHECK-SAME:     ins(%[[TS1]], %[[IN2]] : memref<?x3x3x512xf32>, memref<3x3x512x1xf32>)
-    // CHECK-SAME:     outs(%[[TS2]] : memref<?x1x1x512xf32>)
-    //      CHECK:   return
-
-    //      CHECK: func @kernel_dispatch_1()
-    //      CHECK:   %[[C0:.+]] = constant 0 : index
-    //      CHECK:   %[[C1:.+]] = constant 1 : index
-    //      CHECK:   scf.parallel (%{{.*}}) = (%[[C0]]) to (%[[C1]]) step (%[[C1]])
-    //      CHECK:     scf.yield
-    //      CHECK:   return
-
-    //      CHECK: func @kernel_dispatch_0()
-    //      CHECK:   %[[ZERO:.+]] = constant
-    //      CHECK:   %[[DIM:.+]] = hal.interface.load.constant
-    //      CHECK:   %[[SHAPE:.+]] = shapex.make_ranked_shape %[[DIM]]
-    //      CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-    //      CHECK:   %[[TS:.+]] = shapex.tie_shape %[[OUT]], %[[SHAPE]]
-    //      CHECK:   linalg.fill(%[[TS]], %[[ZERO]])
-    //      CHECK:   return
-
-      func @kernel() {
-        %cst = constant 0.000000e+00 : f32
-        %c0 = constant 0 : index
-        %c1 = constant 1 : index
-        %dim = hal.interface.load.constant offset = 0 : index
-        %shape1 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,3,3,512]>
-        %shape2 = shapex.make_ranked_shape %dim : (index) -> !shapex.ranked_shape<[?,1,1,512]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x3x3x512xf32>
-        %ts1 = shapex.tie_shape %0, %shape1 : memref<?x3x3x512xf32>, !shapex.ranked_shape<[?,3,3,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x1x1x512xf32>
-        %ts2 = shapex.tie_shape %2, %shape2 : memref<?x1x1x512xf32>, !shapex.ranked_shape<[?,1,1,512]>
-        linalg.fill(%ts2, %cst) : memref<?x1x1x512xf32>, f32
-        scf.parallel (%iv) = (%c0) to (%c1) step (%c1) {
-          scf.yield
-        }
-        linalg.conv_2d_input_nhwc_filter_hwcf {
-          dilations = dense<1> : tensor<2xi64>,
-          strides = dense<2> : tensor<2xi64>}
-           ins(%ts1, %1 : memref<?x3x3x512xf32>, memref<3x3x512x1xf32>)
-          outs(%ts2 : memref<?x1x1x512xf32>)
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-
-// Nothing to do if there is just one Linalg op.
-
-hal.executable @kernel attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:1x3x3x512xf32>, !flow.dispatch.tensor<readonly:3x3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:1x1x1x1xf32>) -> ()}
-    // CHECK-NOT: hal.entry_point_schedule
-    module {
-      // CHECK-LABEL: @kernel()
-      func @kernel() attributes {hal.num_workgroups_fn = @kernel__num_workgroups__} {
-        %cst = constant 0.000000e+00 : f32
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<1x3x3x512xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<1x1x1x1xf32>
-        linalg.conv_2d_input_nhwc_filter_hwcf {
-          dilations = dense<1> : tensor<2xi64>,
-          strides = dense<2> : tensor<2xi64>}
-           ins(%0, %1 : memref<1x3x3x512xf32>, memref<3x3x512x1xf32>)
-          outs(%2 : memref<1x1x1x1xf32>)
-        return
-      }
-      // CHECK-LABEL: @kernel__num_workgroups__
-      hal.interface @io attributes {sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-
-
-// -----
-
-// Do not split when Linalg and non-Linalg ops are interleaving each other.
-
-hal.executable @kernel attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x3x512xf32>, !flow.dispatch.tensor<readonly:3x512x1xf32>,
-        !flow.dispatch.tensor<writeonly:?x1x1xf32>) -> ()}
-    module {
-      // expected-error @+1 {{cannot separate Linalg/Parallel ops into multiple kernels}}
-      func @kernel() {
-        %cst = constant 0.000000e+00 : f32
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<1x3x3x512xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<3x3x512x1xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<1x1x1x1xf32>
-        linalg.fill(%2, %cst) : memref<1x1x1x1xf32>, f32
-        "some_op"() : () -> ()
-        linalg.conv_2d_input_nhwc_filter_hwcf {
-          dilations = dense<1> : tensor<2xi64>,
-          strides = dense<2> : tensor<2xi64>}
-           ins(%0, %1 : memref<1x3x3x512xf32>, memref<3x3x512x1xf32>)
-          outs(%2 : memref<1x1x1x1xf32>)
-        return
-      }
-      hal.interface @io attributes {sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-// -----
-#map0 = affine_map<(d0, d1) -> (d0 * 12 + d1 + 53)>
-
-hal.executable @subview_interleaved attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @subview_interleaved attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:18x12xf32>, !flow.dispatch.tensor<writeonly:18x12xf32>) -> ()}
-    module {
-      func @subview_interleaved() {
-        %cst = constant 0.000000e+00 : f32
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<18x12xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<18x12xf32>
-        linalg.fill(%0, %cst) : memref<18x12xf32>, f32
-        %2 = memref.subview %0[4, 5] [18, 12] [1, 1]  : memref<18x12xf32> to memref<18x12xf32, #map0>
-        linalg.copy(%1, %2) : memref<18x12xf32>, memref<18x12xf32, #map0>
-        return
-      }
-      hal.interface @io attributes {sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write"
-      }
-    }
-  }
-}
-
-//      CHECK: #[[MAP0:.+]] = affine_map<(d0, d1) -> (d0 * 12 + d1 + 53)>
-//  CHECK-DAG: hal.executable.entry_point @subview_interleaved_dispatch_0
-//  CHECK-DAG: hal.executable.entry_point @subview_interleaved_dispatch_1
-//      CHECK: module attributes {hal.entry_point_schedule =
-// CHECK-SAME:   [@subview_interleaved_dispatch_0, @subview_interleaved_dispatch_1]}
-//      CHECK: func @subview_interleaved_dispatch_1()
-//  CHECK-DAG:   %[[DST:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<18x12xf32>
-//  CHECK-DAG:   %[[SRC:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<18x12xf32>
-//      CHECK:   %[[SUB:.+]] = memref.subview %[[DST]][4, 5] [18, 12] [1, 1]  : memref<18x12xf32> to memref<18x12xf32, #[[MAP0]]>
-//      CHECK:   linalg.copy(%[[SRC]], %[[SUB]]) : memref<18x12xf32>, memref<18x12xf32, #[[MAP0]]>
-//      CHECK:   return
-//      CHECK: func @subview_interleaved_dispatch_0()
-//      CHECK:   %[[CST:.+]] = constant
-//      CHECK:   %[[DST2:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<18x12xf32>
-//      CHECK:   linalg.fill(%[[DST2]], %[[CST]]) : memref<18x12xf32>, f32
-//      CHECK:   return
-
-// -----
-
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d2)>
-
-hal.executable @reshape_interleaved attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
-    hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @reshape_interleaved attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:2x4xf32>, !flow.dispatch.tensor<writeonly:1x2x4xf32>,
-        !flow.dispatch.tensor<writeonly:2x4xf32>) -> ()}
-    module {
-      func @reshape_interleaved() {
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<2x4xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@ret1} : memref<1x2x4xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<2x4xf32>
-        linalg.generic {indexing_maps = [#map0, #map0],
-                        iterator_types = ["parallel", "parallel"]}
-          ins(%2 : memref<2x4xf32>)
-         outs(%0 : memref<2x4xf32>) {
-        ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
-          %4 = math.tanh %arg0 : f32
-          linalg.yield %4 : f32
-        }
-        %3 = linalg.reshape %0 [#map1, #map2] : memref<2x4xf32> into memref<1x2x4xf32>
-        linalg.copy(%3, %1) : memref<1x2x4xf32>, memref<1x2x4xf32>
-        return
-      }
-      hal.interface @io attributes {sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
-        hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-
-//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2)>
-//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-//  CHECK-DAG: hal.executable.entry_point @reshape_interleaved_dispatch_0
-//  CHECK-DAG: hal.executable.entry_point @reshape_interleaved_dispatch_1
-//      CHECK: module attributes {hal.entry_point_schedule =
-// CHECK-SAME:   [@reshape_interleaved_dispatch_0, @reshape_interleaved_dispatch_1]}
-//      CHECK: func @reshape_interleaved_dispatch_1()
-//      CHECK:   %[[SRC1:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<2x4xf32>
-//      CHECK:   %[[DST:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret1} : memref<1x2x4xf32>
-//      CHECK:   %[[SRC2:.+]] = linalg.reshape %[[SRC1]] [#[[MAP0]], #[[MAP1]]] : memref<2x4xf32> into memref<1x2x4xf32>
-//      CHECK:   linalg.copy(%[[SRC2]], %[[DST]]) : memref<1x2x4xf32>, memref<1x2x4xf32>
-//      CHECK:   return
-//      CHECK: func @reshape_interleaved_dispatch_0()
-//      CHECK:   %[[OUT:.+]] = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<2x4xf32>
-//      CHECK:   %[[IN:.+]] = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<2x4xf32>
-//      CHECK:   linalg.generic
-// CHECK-SAME:     ins(%[[IN]] :
-// CHECK-SAME:    outs(%[[OUT]] :
-
-// -----
-
-hal.executable @predict_ex_dispatch_0 attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-    hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @predict_ex_dispatch_0 attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:1x512x1xf32>, !flow.dispatch.tensor<readonly:4x8x16xf32>,
-        !flow.dispatch.tensor<writeonly:4x8x16xf32>, !flow.dispatch.tensor<writeonly:4x8x16xf32>) -> ()}
-    module {
-      func @predict_ex_dispatch_0() {
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<1x512x1xf32>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@ret1} : memref<4x8x16xf32>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<1x512x1xf32>
-        linalg.copy(%2, %0) : memref<1x512x1xf32>, memref<1x512x1xf32>
-        %3 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<4x8x16xf32>
-        linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (-d0 + 3, d1, d2)>,
-                                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                        iterator_types = ["parallel", "parallel", "parallel"]}
-          ins(%3 : memref<4x8x16xf32>)
-         outs(%1 : memref<4x8x16xf32>) {
-        ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
-          linalg.yield %arg0 : f32
-        }
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}
-//  CHECK-DAG: hal.executable.entry_point @predict_ex_dispatch_0_dispatch_0
-//  CHECK-DAG: hal.executable.entry_point @predict_ex_dispatch_0_dispatch_1
-//      CHECK: module attributes {hal.entry_point_schedule =
-// CHECK-SAME:   [@predict_ex_dispatch_0_dispatch_0, @predict_ex_dispatch_0_dispatch_1]}
-//      CHECK: func @predict_ex_dispatch_0_dispatch_1
-// CHECK-NEXT:   iree.placeholder
-// CHECK-SAME:     binding = @io::@ret1
-// CHECK-NEXT:   iree.placeholder
-// CHECK-SAME:     binding = @io::@arg1
-// CHECK-NEXT:   linalg.generic
-//      CHECK:     linalg.yield
-//  CHECK-NOT:   linalg
-//      CHECK:   return
-//      CHECK: func @predict_ex_dispatch_0_dispatch_0
-// CHECK-NEXT:   iree.placeholder
-// CHECK-SAME:     binding = @io::@ret0
-// CHECK-NEXT:   iree.placeholder
-// CHECK-SAME:     binding = @io::@arg0
-// CHECK-NEXT:   linalg.copy
-//  CHECK-NOT:   linalg
-//      CHECK:   return
-
-// -----
-
-hal.executable @kernel_fusable_fill_matmul_generic_ops attributes {sym_visiblity = "private"} {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.target @vulkan, filter="vulkan*" {
-    hal.executable.entry_point @kernel_fusable_fill_matmul_generic_ops attributes {
-      interface = @io, ordinal = 0 : index,
-      signature = (!flow.dispatch.tensor<readonly:?x512xf32>, !flow.dispatch.tensor<readonly:512x?xf32>,
-        !flow.dispatch.tensor<readonly:?x?xf32>, !flow.dispatch.tensor<writeonly:?x?xf32>) -> ()}
-    module {
-      //     CHECK: func @kernel_fusable_fill_matmul_generic_ops
-      //     CHECK:   linalg.fill
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.matmul
-      // CHECK-NOT:   return
-      //     CHECK:   linalg.generic
-      //     CHECK:   return
-
-      func @kernel_fusable_fill_matmul_generic_ops() {
-        %cst = constant 0.000000e+00 : f32
-        %dimM = hal.interface.load.constant offset = 0 : index
-        %dimN = hal.interface.load.constant offset = 1 : index
-        %shape1 = shapex.make_ranked_shape %dimM : (index) -> !shapex.ranked_shape<[?,512]>
-        %shape2 = shapex.make_ranked_shape %dimN : (index) -> !shapex.ranked_shape<[512,?]>
-        %shape3 = shapex.make_ranked_shape %dimM, %dimN : (index, index) -> !shapex.ranked_shape<[?,?]>
-        %0 = iree.placeholder for "interface buffer" {binding = @io::@arg0} : memref<?x512xf32>
-        %ts0 = shapex.tie_shape %0, %shape1 : memref<?x512xf32>, !shapex.ranked_shape<[?,512]>
-        %1 = iree.placeholder for "interface buffer" {binding = @io::@arg1} : memref<512x?xf32>
-        %ts1 = shapex.tie_shape %1, %shape2 : memref<512x?xf32>, !shapex.ranked_shape<[512, ?]>
-        %2 = iree.placeholder for "interface buffer" {binding = @io::@arg2} : memref<?x?xf32>
-        %ts2 = shapex.tie_shape %2, %shape3 : memref<?x?xf32>, !shapex.ranked_shape<[?, ?]>
-        %3 = iree.placeholder for "interface buffer" {binding = @io::@ret0} : memref<?x?xf32>
-        %ts3 = shapex.tie_shape %3, %shape3 : memref<?x?xf32>, !shapex.ranked_shape<[?,?]>
-        %4 = memref.alloc(%dimM, %dimN) : memref<?x?xf32>
-        %ts4 = shapex.tie_shape %4, %shape3 : memref<?x?xf32>, !shapex.ranked_shape<[?,?]>
-        linalg.fill(%ts4, %cst) : memref<?x?xf32>, f32
-        linalg.matmul ins(%ts0, %ts1 : memref<?x512xf32>, memref<512x?xf32>)
-                      outs(%ts4 : memref<?x?xf32>)
-        linalg.generic
-          {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                            affine_map<(d0, d1) -> (d0, d1)>,
-                            affine_map<(d0, d1) -> (d0, d1)>],
-           iterator_types = ["parallel", "parallel"]}
-          ins(%ts2, %ts4 : memref<?x?xf32>, memref<?x?xf32>)
-          outs(%ts3 : memref<?x?xf32>) {
-          ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32):
-            %5 = addf %arg0, %arg1 : f32
-            linalg.yield %5 : f32
-        }
-        return
-      }
-      hal.interface @io attributes {push_constants = 1 : index, sym_visibility = "private"} {
-        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @arg2, set=0, binding=1, type="StorageBuffer", access="Read"
-        hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-      }
-    }
-  }
-}

diff --git a/iree/compiler/Conversion/LinalgToSPIRV/test/tile_and_vectorize_conv.mlir b/iree/compiler/Conversion/LinalgToSPIRV/test/tile_and_vectorize_conv.mlir
index 2a821bc..3eeae63 100644
--- a/iree/compiler/Conversion/LinalgToSPIRV/test/tile_and_vectorize_conv.mlir
+++ b/iree/compiler/Conversion/LinalgToSPIRV/test/tile_and_vectorize_conv.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt -split-input-file -pass-pipeline="hal.executable(hal.executable.target(iree-spirv-concretize-tile-among-workgroups,iree-spirv-tile-and-vectorize-in-one-workgroup))" -iree-spirv-enable-vectorization -iree-codegen-spirv-experimental-linalg-on-tensors -canonicalize -cse %s | IreeFileCheck %s
+// RUN: iree-opt -split-input-file -pass-pipeline="hal.executable(hal.executable.target(iree-spirv-concretize-tile-among-workgroups,iree-spirv-tile-and-vectorize-in-one-workgroup))" -canonicalize -cse %s | IreeFileCheck %s
 
 hal.executable @conv_static_shape_f32 attributes {sym_visibility = "private"} {
   hal.interface @io {

diff --git a/iree/compiler/Conversion/init_conversions.h b/iree/compiler/Conversion/init_conversions.h
index 9220eb6..874adf6 100644
--- a/iree/compiler/Conversion/init_conversions.h
+++ b/iree/compiler/Conversion/init_conversions.h

@@ -64,13 +64,10 @@
 inline void registerLinalgToSPIRVPasses() {
   static bool init_once = []() {
     // LinalgToSPIRV
-    createConvertToGPUPass(SPIRVCodegenOptions());
+    createConvertToGPUPass();
     createFoldProcessorIDUsesPass();
-    createTileAndDistributeAmongWorkgroupsPass(SPIRVCodegenOptions());
     createTileAndVectorizeInOneWorkgroupPass(SPIRVCodegenOptions());
-    createSplitDispatchFunctionPass();
     createVectorToGPUPass();
-    createMatMulTileAndVectorizeGPUPass();
     createVectorizeMemrefLoadStorePass();
     return true;
   }();

diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp
index 629a2e5..62418fc 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMAOTTarget.cpp

@@ -16,7 +16,6 @@
 
 #include <cstdlib>
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/LLVMCodeGenOptions.h"
 #include "iree/compiler/Conversion/LinalgToLLVM/Passes.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVM/LLVMIRPasses.h"

diff --git a/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp
index 3bb9b7f..eb684b4 100644
--- a/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Dialect/HAL/Target/MetalSPIRV/MetalSPIRVTarget.h"
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Dialect/HAL/Target/MetalSPIRV/SPIRVToMSL.h"
 #include "iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h"
 #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
@@ -79,18 +78,9 @@
     // names for constructing pipeline states. Get an ordered list of the entry
     // point names.
     SmallVector<StringRef, 8> entryPointNames;
-    if (auto scheduleAttr = innerModuleOp->getAttrOfType<ArrayAttr>(
-            iree_compiler::getEntryPointScheduleAttrName())) {
-      // We have multiple entry points in this module. Make sure the order
-      // specified in the schedule attribute is respected.
-      for (Attribute entryPoint : scheduleAttr) {
-        entryPointNames.push_back(entryPoint.cast<StringAttr>().getValue());
-      }
-    } else {
-      spvModuleOp.walk([&](spirv::EntryPointOp entryPointOp) {
-        entryPointNames.push_back(entryPointOp.fn());
-      });
-    }
+    spvModuleOp.walk([&](spirv::EntryPointOp entryPointOp) {
+      entryPointNames.push_back(entryPointOp.fn());
+    });
 
     // 1. Serialize the spirv::ModuleOp into binary format.
     SmallVector<uint32_t, 0> spvBinary;

diff --git a/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.cpp
index 65c035d..86dee04 100644
--- a/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h"
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
 #include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
@@ -28,15 +27,6 @@
 namespace IREE {
 namespace HAL {
 
-// Records a full execution barrier that forces visibility of all buffers.
-static void recordFullExecutionBarrier(Value commandBuffer, Location loc,
-                                       OpBuilder &builder) {
-  builder.create<IREE::HAL::CommandBufferExecutionBarrierOp>(
-      loc, commandBuffer, IREE::HAL::ExecutionStageBitfield::Dispatch,
-      IREE::HAL::ExecutionStageBitfield::Dispatch,
-      IREE::HAL::ExecutionBarrierFlagBitfield::None);
-}
-
 SPIRVTargetBackend::SPIRVTargetBackend(SPIRVCodegenOptions options)
     : spvCodeGenOptions_(std::move(options)) {}
 
@@ -61,170 +51,6 @@
   buildSPIRVTransformPassPipeline(passManager, spvCodeGenOptions_);
 }
 
-LogicalResult SPIRVTargetBackend::recordDispatch(
-    Location loc, DispatchState dispatchState,
-    DeviceSwitchRewriter &switchRewriter) {
-  // TODO(#4140): remove this legacy path when linalg-on-tensors is used.
-  // In the linalg-on-tensors world where we are performing the tiling logic
-  // in the flow dialect we don't even really need the ability to override
-  // dispatch recording at all - just a way to allow targets to map workgroup
-  // counts from the N-dimensional flow workgroup counts to the 3D hal counts.
-  if (dispatchState.workgroupCount.size() == 3) {
-    return TargetBackend::recordDispatch(loc, dispatchState, switchRewriter);
-  }
-
-  // Multiple entry points might be generated for a single dispatch function.
-  // Under such circumstances, we will have a special attribute indicating the
-  // schedule of the split entry points. Try to see if we can find such
-  // schedule attribute first.
-  ArrayAttr entryPointScheduleAttr;
-  spirv::ModuleOp spvModuleOp;
-  IREE::HAL::ExecutableOp executableOp = dispatchState.executableOp;
-  for (auto executableTargetOp :
-       executableOp.getBlock().getOps<IREE::HAL::ExecutableTargetOp>()) {
-    if (matchPattern(executableTargetOp.target_backend_filter(),
-                     filter_pattern())) {
-      ModuleOp innerModuleOp = executableTargetOp.getInnerModule();
-      auto spvModuleOps = innerModuleOp.getOps<spirv::ModuleOp>();
-      assert(llvm::hasSingleElement(spvModuleOps));
-      spvModuleOp = *spvModuleOps.begin();
-      entryPointScheduleAttr = innerModuleOp->getAttrOfType<ArrayAttr>(
-          iree_compiler::getEntryPointScheduleAttrName());
-      if (!spvModuleOp)
-        return executableOp.emitError("unable to find spv.module");
-
-      SmallVector<IREE::HAL::ExecutableEntryPointOp, 2> entryPoints;
-      if (!entryPointScheduleAttr) {
-        entryPoints = llvm::to_vector<2>(
-            executableTargetOp.getOps<IREE::HAL::ExecutableEntryPointOp>());
-        if (!llvm::hasSingleElement(entryPoints)) {
-          return executableTargetOp.emitError(
-                     "expected a single entry point, found ")
-                 << entryPoints.size();
-        }
-      } else {
-        SymbolTable symTable(executableTargetOp);
-        for (Attribute entryPointAttr : entryPointScheduleAttr) {
-          auto entryPointOp =
-              symTable.lookup<IREE::HAL::ExecutableEntryPointOp>(
-                  entryPointAttr.cast<FlatSymbolRefAttr>().getValue());
-          if (!entryPointOp) {
-            return executableTargetOp.emitError(
-                       "unable to find hal.executable.entry_point operation "
-                       "for ")
-                   << entryPointAttr.cast<FlatSymbolRefAttr>().getValue();
-          }
-          entryPoints.push_back(entryPointOp);
-        }
-      }
-
-      auto *region = switchRewriter.addConditionRegion(
-          IREE::HAL::DeviceMatchIDAttr::get(filter_pattern(), loc.getContext()),
-          {
-              dispatchState.workgroupCount[0],
-              dispatchState.commandBuffer,
-          });
-
-      auto &entryBlock = region->front();
-      ConversionPatternRewriter &rewriter = switchRewriter.getRewriter();
-      OpBuilder::InsertionGuard guard(rewriter);
-      rewriter.setInsertionPointToEnd(&entryBlock);
-      auto workload = entryBlock.getArgument(0);
-      auto commandBuffer = entryBlock.getArgument(1);
-
-      // We have multiple entry points to dispatch. Record in the order
-      // specified by entry point schedule and insert barrier between sequential
-      // ones.
-      for (auto entryPoint : llvm::enumerate(entryPoints)) {
-        std::array<Value, 3> workgroupCount = calculateDispatchWorkgroupCount(
-            loc, executableOp, entryPoint.value(), workload, rewriter);
-
-        if (llvm::any_of(workgroupCount,
-                         [](Value v) -> bool { return v == nullptr; })) {
-          return entryPoint.value().emitError("unable to find workgroup count");
-        }
-
-        // Ordinals are fixed based on the precomputed schedule, so use
-        // CommandBufferDispatchOp instead of CommandBufferDispatchSymbolOp.
-        auto executable = rewriter
-                              .create<IREE::HAL::ExecutableLookupOp>(
-                                  loc, dispatchState.device,
-                                  dispatchState.dispatchOp.executable())
-                              .getResult();
-        int32_t entryPointOrdinal = entryPoint.index();
-        rewriter.create<IREE::HAL::CommandBufferDispatchOp>(
-            loc, commandBuffer, executable,
-            rewriter.getIndexAttr(entryPointOrdinal), workgroupCount[0],
-            workgroupCount[1], workgroupCount[2]);
-        if (entryPoint.index() + 1 != entryPoints.size()) {
-          recordFullExecutionBarrier(commandBuffer, loc, rewriter);
-        }
-      }
-      rewriter.create<IREE::HAL::ReturnOp>(loc);
-    }
-  }
-  return success();
-}
-
-// Finds the spv.ExecutionMode operation to get the workgroup size from.
-// TODO(ravishankarm): This might not be the only way this is specified. You
-// could also have a spec constant, but that is not generated in the
-// spv.module right now.
-// TODO(ravishankarm): change workgroup size calculation to something we can
-// query independently so that we don't need to lookup the value here.
-std::array<Value, 3> SPIRVTargetBackend::calculateDispatchWorkgroupSize(
-    Location loc, IREE::HAL::ExecutableOp executableOp,
-    IREE::HAL::ExecutableEntryPointOp entryPointOp, ValueRange workload,
-    OpBuilder &builder) {
-  // TODO(ravishankarm): possibly emit different recordDispatch logic if the
-  // workgroup sizes differ among targets.
-  spirv::ModuleOp spvModuleOp;
-  for (auto executableTargetOp :
-       executableOp.getBlock().getOps<IREE::HAL::ExecutableTargetOp>()) {
-    if (matchPattern(executableTargetOp.target_backend_filter(),
-                     filter_pattern())) {
-      ModuleOp innerModuleOp = executableTargetOp.getInnerModule();
-      assert(!innerModuleOp->getAttr(
-          iree_compiler::getEntryPointScheduleAttrName()));
-      auto spvModuleOps = innerModuleOp.getOps<spirv::ModuleOp>();
-      assert(llvm::hasSingleElement(spvModuleOps));
-      spvModuleOp = *spvModuleOps.begin();
-      break;
-    }
-  }
-  return calculateDispatchWorkgroupSize(
-      loc, spvModuleOp, entryPointOp.sym_name(), workload, builder);
-}
-
-std::array<Value, 3> SPIRVTargetBackend::calculateDispatchWorkgroupSize(
-    Location loc, spirv::ModuleOp spvModuleOp, StringRef entryPointName,
-    ValueRange workload, OpBuilder &builder) {
-  std::array<Value, 3> workgroupSize;
-  for (auto executionModeOp :
-       spvModuleOp.getBlock().getOps<spirv::ExecutionModeOp>()) {
-    if (executionModeOp.fn() == entryPointName &&
-        executionModeOp.execution_mode() == spirv::ExecutionMode::LocalSize) {
-      for (int i = 0; i < executionModeOp.values().size(); ++i) {
-        workgroupSize[i] =
-            builder.create<ConstantIndexOp>(loc, executionModeOp.values()[i]
-                                                     .cast<IntegerAttr>()
-                                                     .getValue()
-                                                     .getZExtValue());
-      }
-      break;
-    }
-  }
-
-  // Pad out the workgroup size with 1's (if the original rank was < 3).
-  for (int i = 0; i < workgroupSize.size(); ++i) {
-    if (!workgroupSize[i]) {
-      workgroupSize[i] = builder.create<ConstantIndexOp>(loc, 1);
-    }
-  }
-
-  return workgroupSize;
-}
-
 }  // namespace HAL
 }  // namespace IREE
 }  // namespace iree_compiler

diff --git a/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h b/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h
index ea3389c..9378c56 100644
--- a/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h
+++ b/iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h

@@ -38,20 +38,6 @@
 
   void buildTranslationPassPipeline(OpPassManager &passManager) override;
 
-  LogicalResult recordDispatch(Location loc, DispatchState dispatchState,
-                               DeviceSwitchRewriter &switchRewriter) override;
-
-  // Finds the spv.ExecutionMode operation to get the workgroup size from.
-  std::array<Value, 3> calculateDispatchWorkgroupSize(
-      Location loc, IREE::HAL::ExecutableOp executableOp,
-      IREE::HAL::ExecutableEntryPointOp entryPointOp, ValueRange workload,
-      OpBuilder &builder) override;
-
- private:
-  std::array<Value, 3> calculateDispatchWorkgroupSize(
-      Location loc, spirv::ModuleOp spvModuleOp, StringRef entryPointName,
-      ValueRange workload, OpBuilder &builder);
-
   SPIRVCodegenOptions spvCodeGenOptions_;
 };
 

diff --git a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
index e1a556e..30499ab 100644
--- a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp

@@ -14,7 +14,6 @@
 
 #include "iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.h"
 
-#include "iree/compiler/Conversion/Common/Attributes.h"
 #include "iree/compiler/Conversion/LinalgToSPIRV/CodeGenOptionUtils.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/Target/SPIRVCommon/SPIRVTarget.h"
@@ -144,19 +143,9 @@
     // list of entry point names here that are then passed in
     // VkShaderModuleCreateInfo.
     SmallVector<StringRef, 8> entryPointNames;
-    if (auto scheduleAttr = innerModuleOp->getAttrOfType<ArrayAttr>(
-            iree_compiler::getEntryPointScheduleAttrName())) {
-      // We have multiple entry points in this module. Make sure the order
-      // specified in the schedule attribute is respected.
-      for (Attribute entryPoint : scheduleAttr) {
-        entryPointNames.push_back(
-            entryPoint.cast<FlatSymbolRefAttr>().getValue());
-      }
-    } else {
-      spvModuleOp.walk([&](spirv::EntryPointOp entryPointOp) {
-        entryPointNames.push_back(entryPointOp.fn());
-      });
-    }
+    spvModuleOp.walk([&](spirv::EntryPointOp entryPointOp) {
+      entryPointNames.push_back(entryPointOp.fn());
+    });
     auto entryPointsRef = builder.createStringVec(entryPointNames);
 
     iree_SpirVExecutableDef_entry_points_add(builder, entryPointsRef);

diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
index 4bfa50a..e611f52 100644
--- a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp

@@ -268,13 +268,11 @@
     Value listOperand = op.getOperation()->getOperand(listArgumentIndex);
 
     // deref
-    auto refOp = rewriter.create<emitc::CallOp>(
+    auto refOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t"),
-        /*callee=*/rewriter.getStringAttr("*"),
-        /*args=*/ArrayAttr{},
-        /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{listOperand});
+        /*applicableOperator=*/rewriter.getStringAttr("*"),
+        /*operand=*/listOperand);
 
     auto listDerefOp = rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -282,7 +280,7 @@
         /*callee=*/rewriter.getStringAttr("iree_vm_list_deref"),
         /*args=*/ArrayAttr{},
         /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{refOp.getResult(0)});
+        /*operands=*/ArrayRef<Value>{refOp.getResult()});
 
     rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -386,9 +384,10 @@
         /*templateArgs=*/ArrayAttr{},
         /*operands=*/ArrayRef<Value>{});
 
-    auto elementTypePtrOp = rewriter.create<emitc::GetAddressOfOp>(
+    auto elementTypePtrOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*result=*/emitc::OpaqueType::get(ctx, "iree_vm_type_def_t*"),
+        /*applicableOperator=*/rewriter.getStringAttr("&"),
         /*operand=*/elementTypeOp.getResult(0));
 
     auto listOp = rewriter.create<emitc::ConstOp>(
@@ -396,9 +395,10 @@
         /*resultType=*/emitc::OpaqueType::get(ctx, "iree_vm_list_t*"),
         /*value=*/StringAttr::get(ctx, "NULL"));
 
-    auto listPtrOp = rewriter.create<emitc::GetAddressOfOp>(
+    auto listPtrOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*result=*/emitc::OpaqueType::get(ctx, "iree_vm_list_t**"),
+        /*applicableOperator=*/rewriter.getStringAttr("&"),
         /*operand=*/listOp.getResult());
 
     failableCall(
@@ -497,18 +497,17 @@
         /*resultType=*/emitc::OpaqueType::get(ctx, "iree_vm_value_t"),
         /*value=*/StringAttr::get(ctx, ""));
 
-    auto valuePtrOp = rewriter.create<emitc::GetAddressOfOp>(
+    auto valuePtrOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*result=*/emitc::OpaqueType::get(ctx, "iree_vm_value_t*"),
+        /*applicableOperator=*/rewriter.getStringAttr("&"),
         /*operand=*/valueOp.getResult());
 
-    auto refOp = rewriter.create<emitc::CallOp>(
+    auto refOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t"),
-        /*callee=*/rewriter.getStringAttr("*"),
-        /*args=*/ArrayAttr{},
-        /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{getOp.list()});
+        /*applicableOperator=*/rewriter.getStringAttr("*"),
+        /*operand=*/getOp.list());
 
     auto listDerefOp = rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -516,7 +515,7 @@
         /*callee=*/rewriter.getStringAttr("iree_vm_list_deref"),
         /*args=*/ArrayAttr{},
         /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{refOp.getResult(0)});
+        /*operands=*/ArrayRef<Value>{refOp.getResult()});
 
     rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -584,18 +583,17 @@
         /*templateArgs=*/ArrayAttr{},
         /*operands=*/ArrayRef<Value>{setOp.value()});
 
-    auto valuePtrOp = rewriter.create<emitc::GetAddressOfOp>(
+    auto valuePtrOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*result=*/emitc::OpaqueType::get(ctx, "iree_vm_value_t*"),
+        /*applicableOperator=*/rewriter.getStringAttr("&"),
         /*operand=*/valueOp.getResult(0));
 
-    auto refOp = rewriter.create<emitc::CallOp>(
+    auto refOp = rewriter.create<emitc::ApplyOp>(
         /*location=*/loc,
         /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t"),
-        /*callee=*/rewriter.getStringAttr("*"),
-        /*args=*/ArrayAttr{},
-        /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{setOp.list()});
+        /*applicableOperator=*/rewriter.getStringAttr("*"),
+        /*operand=*/setOp.list());
 
     auto listDerefOp = rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -603,7 +601,7 @@
         /*callee=*/rewriter.getStringAttr("iree_vm_list_deref"),
         /*args=*/ArrayAttr{},
         /*templateArgs=*/ArrayAttr{},
-        /*operands=*/ArrayRef<Value>{refOp.getResult(0)});
+        /*operands=*/ArrayRef<Value>{refOp.getResult()});
 
     rewriter.create<emitc::CallOp>(
         /*location=*/loc,

diff --git a/iree/compiler/Dialect/Vulkan/Utils/test/target_env_conversion.mlir b/iree/compiler/Dialect/Vulkan/Utils/test/target_env_conversion.mlir
index cd85e9d..f8da971 100644
--- a/iree/compiler/Dialect/Vulkan/Utils/test/target_env_conversion.mlir
+++ b/iree/compiler/Dialect/Vulkan/Utils/test/target_env_conversion.mlir

@@ -1,7 +1,7 @@
-// RUN: iree-opt -iree-codegen-spirv-experimental-linalg-on-tensors -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv %s | IreeFileCheck %s -check-prefix=DEFAULT
-// RUN: iree-opt -iree-codegen-spirv-experimental-linalg-on-tensors -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=qualcomm-adreno640-unknown-android10 %s | IreeFileCheck %s -check-prefix=ADRENO640
-// RUN: iree-opt -iree-codegen-spirv-experimental-linalg-on-tensors -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=valhall-g77-unknown-android10 %s | IreeFileCheck %s -check-prefix=MALIG77
-// RUN: iree-opt -iree-codegen-spirv-experimental-linalg-on-tensors -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=turing-t4-unknown-linux %s | IreeFileCheck %s -check-prefix=TURINGT4
+// RUN: iree-opt -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv %s | IreeFileCheck %s -check-prefix=DEFAULT
+// RUN: iree-opt -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=qualcomm-adreno640-unknown-android10 %s | IreeFileCheck %s -check-prefix=ADRENO640
+// RUN: iree-opt -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=valhall-g77-unknown-android10 %s | IreeFileCheck %s -check-prefix=MALIG77
+// RUN: iree-opt -pass-pipeline='iree-hal-transformation-pipeline{serialize-executables=false}' -iree-hal-target-backends=vulkan-spirv -iree-vulkan-target-triple=turing-t4-unknown-linux %s | IreeFileCheck %s -check-prefix=TURINGT4
 
 // TODO(antiagainst): Passing in lenghty strings as command-line options is not
 // optimal. We should consider creating a dedicated test pass to pick up

diff --git a/iree/hal/local/elf/platform/apple.c b/iree/hal/local/elf/platform/apple.c
index 78b7acb..ae7aeeb 100644
--- a/iree/hal/local/elf/platform/apple.c
+++ b/iree/hal/local/elf/platform/apple.c

@@ -29,6 +29,17 @@
 #include <sys/mman.h>
 #include <unistd.h>
 
+// Gates MAP_JIT and JIT write-protect calls to macOS 11.0+ (SDK and runtime).
+#if defined(MAC_OS_VERSION_11_0) && \
+    MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
+#define IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0(expr) \
+  if (__builtin_available(macOS 11.0, *)) {      \
+    expr                                         \
+  }
+#else
+#define IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0(expr)
+#endif  // MAC_OS_VERSION_11_0
+
 //==============================================================================
 // Memory subsystem information and control
 //==============================================================================
@@ -44,9 +55,21 @@
   out_info->can_allocate_executable_pages = true;
 }
 
-void iree_memory_jit_context_begin() { pthread_jit_write_protect_np(0); }
+void iree_memory_jit_context_begin() {
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (pthread_jit_write_protect_supported_np()) {
+      pthread_jit_write_protect_np(0);
+    }
+  });
+}
 
-void iree_memory_jit_context_end() { pthread_jit_write_protect_np(1); }
+void iree_memory_jit_context_end() {
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (pthread_jit_write_protect_supported_np()) {
+      pthread_jit_write_protect_np(1);
+    }
+  });
+}
 
 //==============================================================================
 // Virtual address space manipulation
@@ -71,9 +94,11 @@
 
   int mmap_prot = PROT_NONE;
   int mmap_flags = MAP_PRIVATE | MAP_ANON | MAP_NORESERVE;
-  if (flags & IREE_MEMORY_VIEW_FLAG_EXECUTE) {
-    mmap_flags |= MAP_JIT;
-  }
+  IREE_APPLE_IF_AT_LEAST_MAC_OS_11_0({
+    if (flags & IREE_MEMORY_VIEW_FLAG_MAY_EXECUTE) {
+      mmap_flags |= MAP_JIT;
+    }
+  });
 
   iree_status_t status = iree_ok_status();
   void* base_address =

diff --git a/third_party/mlir-emitc b/third_party/mlir-emitc
index 3c265bf..679d718 160000
--- a/third_party/mlir-emitc
+++ b/third_party/mlir-emitc

@@ -1 +1 @@
-Subproject commit 3c265bf59bf2515a63ec35571c66954349749a62
+Subproject commit 679d7183b657a24f48d16de1fcefb20d7cd1f6a2