Merge pull request #7131 from rsuderman:main-to-google

PiperOrigin-RevId: 398345005
diff --git a/.gitignore b/.gitignore
index 6fe9bf2..20f215e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 .pytype/
 
 # Visual Studio files
+.env
 .vs/
 .vscode/
 *.sdf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de032a8..00a617a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,8 @@
   option(IREE_BUILD_PYTHON_BINDINGS "Builds the IREE python bindings" OFF)
 endif()
 
+option(IREE_BUILD_OLD_PYTHON_COMPILER_API "Builds the original Python compiler API" ON)
+
 #-------------------------------------------------------------------------------
 # Experimental project flags
 #-------------------------------------------------------------------------------
@@ -438,7 +440,7 @@
   enable_testing(iree)
 endif()
 
-if(IREE_BUILD_PYTHON_BINDINGS)
+if(IREE_BUILD_PYTHON_BINDINGS AND NOT pybind11_FOUND)
   add_subdirectory(third_party/pybind11 EXCLUDE_FROM_ALL)
 endif()
 
diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt
index b80db7b..92a2475 100644
--- a/bindings/python/CMakeLists.txt
+++ b/bindings/python/CMakeLists.txt
@@ -13,7 +13,7 @@
 add_subdirectory(iree/runtime)
 add_subdirectory(iree/jax)
 
-if(${IREE_BUILD_COMPILER})
+if(IREE_BUILD_COMPILER AND IREE_BUILD_OLD_PYTHON_COMPILER_API)
 add_subdirectory(iree/compiler)
 add_subdirectory(iree/tools/core)
 endif()
diff --git a/bindings/python/iree/runtime/system_api.py b/bindings/python/iree/runtime/system_api.py
index 3731781..fc82a3b 100644
--- a/bindings/python/iree/runtime/system_api.py
+++ b/bindings/python/iree/runtime/system_api.py
@@ -339,8 +339,7 @@
   if backend is not None:
     driver = TARGET_BACKEND_TO_DRIVER[backend]
   vm_module = _binding.VmModule.from_flatbuffer(vm_flatbuffer)
-  config = Config(TARGET_BACKEND_TO_DRIVER[backend])
-  bound_module = load_vm_module(vm_module, config)
+  bound_module = load_vm_module(vm_module, Config(driver))
   return bound_module
 
 
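Note on the system_api.py fix above: the old code re-derived the driver with TARGET_BACKEND_TO_DRIVER[backend], which raises a KeyError whenever the caller passes a driver directly and leaves backend as None; the rewrite threads the already-resolved driver through instead. A minimal sketch of the corrected path (assuming a precompiled module blob at a hypothetical path, and "vmvx" as a hypothetical driver choice; Config, VmModule, and load_vm_module are the names from the hunk, re-exported by iree.runtime):

    import iree.runtime as ireert

    # Hypothetical precompiled VM FlatBuffer, produced by the compiler elsewhere.
    with open("module.vmfb", "rb") as f:
        vm_flatbuffer = f.read()

    # Mirror the fixed code path: resolve the module against an explicitly
    # named driver via load_vm_module(vm_module, Config(driver)).
    vm_module = ireert.VmModule.from_flatbuffer(vm_flatbuffer)
    bound_module = ireert.load_vm_module(vm_module, ireert.Config("vmvx"))
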
diff --git a/build_tools/bazel/iree.bazelrc b/build_tools/bazel/iree.bazelrc
index cc6ddfb..e987141 100644
--- a/build_tools/bazel/iree.bazelrc
+++ b/build_tools/bazel/iree.bazelrc
@@ -260,7 +260,7 @@
 # specific docker container the TF build is run in. The image URL is included
 # for clarity and so that this reference is automatically updated by
 # manage_images.py
-build:remote_cache_tf_integrations --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:103676490242311b9fad841294689a7ce1c755b935a21d8d898c25cfe3ec15e8"}'
+build:remote_cache_tf_integrations --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd"}'
 
 ### Remote Execution ###
 # --config=rbe Execute and cache builds remotely.
diff --git a/build_tools/buildkite/samples.yml b/build_tools/buildkite/samples.yml
index f4e3d7c..18750f4 100644
--- a/build_tools/buildkite/samples.yml
+++ b/build_tools/buildkite/samples.yml
@@ -7,7 +7,7 @@
 steps:
   - label: "Test Colab notebooks"
     commands:
-      - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f python3 colab/test_notebooks.py"
+      - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced python3 colab/test_notebooks.py"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
     agents:
@@ -15,7 +15,7 @@
 
   - label: "Test Samples"
     commands:
-      - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f build_tools/testing/test_samples.sh"
+      - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced build_tools/testing/test_samples.sh"
     env:
       IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
     agents:
diff --git a/build_tools/cmake/iree_macros.cmake b/build_tools/cmake/iree_macros.cmake
index f5049eb..cb90828 100644
--- a/build_tools/cmake/iree_macros.cmake
+++ b/build_tools/cmake/iree_macros.cmake
@@ -215,8 +215,8 @@
       # If this file is included in multiple rules, only create the target once.
       string(REPLACE "::" "_" _DATA_TARGET ${_DATA_LABEL})
       if(NOT TARGET ${_DATA_TARGET})
-        set(_INPUT_PATH "${CMAKE_SOURCE_DIR}/${_FILE_PATH}")
-        set(_OUTPUT_PATH "${CMAKE_BINARY_DIR}/${_FILE_PATH}")
+        set(_INPUT_PATH "${PROJECT_SOURCE_DIR}/${_FILE_PATH}")
+        set(_OUTPUT_PATH "${PROJECT_BINARY_DIR}/${_FILE_PATH}")
         add_custom_target(${_DATA_TARGET}
           COMMAND ${CMAKE_COMMAND} -E copy ${_INPUT_PATH} ${_OUTPUT_PATH}
         )
diff --git a/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile b/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
index 773acf4..45ba144 100644
--- a/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
+++ b/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
@@ -5,7 +5,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 FROM gcr.io/iree-oss/cmake-bazel-frontends-vulkan@sha256:cdf41d7ee7707eb3e79d56f2f1f8bd7e9a0ac3a1122dc4f89f8190154796a6bc AS final
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02 \
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995 \
   /swiftshader /swiftshader
 
 # Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/docker/cmake-python-swiftshader/Dockerfile b/build_tools/docker/cmake-python-swiftshader/Dockerfile
index 45946dd..62fc864 100644
--- a/build_tools/docker/cmake-python-swiftshader/Dockerfile
+++ b/build_tools/docker/cmake-python-swiftshader/Dockerfile
@@ -8,7 +8,7 @@
 # Vulkan implementation.
 
 FROM gcr.io/iree-oss/cmake-python-vulkan@sha256:894d91b6ddd3435f0e5cb4424a81296438252dae4d8934e84aa6be4d02b81343 AS final
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02 \
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995 \
   /swiftshader /swiftshader
 
 # Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/docker/prod_digests.txt b/build_tools/docker/prod_digests.txt
index 579d710..f136d73 100644
--- a/build_tools/docker/prod_digests.txt
+++ b/build_tools/docker/prod_digests.txt
@@ -1,20 +1,20 @@
 gcr.io/iree-oss/base@sha256:9b73f4e2b1239f65a19f2022e54f4b15310b805570831fbe2cf8b4dc928f1d10
 gcr.io/iree-oss/util@sha256:40846b4aea5886af3250399d6adfdb3e1195a8b0177706bb0375e812d62dc49c
 gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6
-gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02
+gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995
 gcr.io/iree-oss/cmake-python@sha256:51817f1a98f9ed9237577133b4c674b163280fd747c1745d6d0d93f0f2b01fb3
 gcr.io/iree-oss/cmake-android@sha256:7d780787608474301e74e1b5cc2a1bfd1304a79ed9e0774c7ed422c0e4a38625
 gcr.io/iree-oss/bazel@sha256:5e52c7d43b6fdff35d884b8b8b92b1b6e2151d675019edc92f09018e558e0f94
 gcr.io/iree-oss/vulkan@sha256:5812ee64806a7f3df0739ccf0930c27cabce346901488eceb1ee66c9c0a5ae96
 gcr.io/iree-oss/rbe-toolchain@sha256:62b161e79413f0f59ae3845c377b10e60a4a639f3d32569a82b620f017837a68
 gcr.io/iree-oss/cmake-python-vulkan@sha256:894d91b6ddd3435f0e5cb4424a81296438252dae4d8934e84aa6be4d02b81343
-gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e
+gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5
 gcr.io/iree-oss/cmake-python-nvidia@sha256:d96ffdc44026bf112efca82a5150d783e8eba8976c7bc150863ec5868de40778
 gcr.io/iree-oss/cmake-bazel-frontends@sha256:7633ff2b483a07b6c786ffab40cca7cae64d6a211ad0e95ff55d3f1cd2dd1ea9
 gcr.io/iree-oss/cmake-bazel-frontends-vulkan@sha256:cdf41d7ee7707eb3e79d56f2f1f8bd7e9a0ac3a1122dc4f89f8190154796a6bc
 gcr.io/iree-oss/cmake-bazel-frontends-nvidia@sha256:7a2189f9c2c5491878fdf6d38ddab18832020a06285eeff31b8376b9415fb7e9
-gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:26fbfb5e4813a2d118b68694398de950c808228757a79f2a8909dd332792d9d3
+gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd
 gcr.io/iree-oss/cmake-riscv@sha256:95489593bc9b0cd325ce9c1a32b47389c01b174a5b8190a16d937d2e8828d384
 gcr.io/iree-oss/cmake-bazel-frontends-android@sha256:cdb1b38772643f7acbc296b558ccc868900a47a1378cf63da3fbe469dcf42428
-gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f
+gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced
 gcr.io/iree-oss/cmake-emscripten@sha256:8acad361d23cb586187c2ea29df3a1ab301b5283c3648beb328681d69ecd0ab0
diff --git a/build_tools/docker/rbe-toolchain/Dockerfile b/build_tools/docker/rbe-toolchain/Dockerfile
index f7df137..66c615e 100644
--- a/build_tools/docker/rbe-toolchain/Dockerfile
+++ b/build_tools/docker/rbe-toolchain/Dockerfile
@@ -34,7 +34,7 @@
   python3 \
   zlib1g-dev
 
-ARG SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+ARG SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
 
 RUN git clone https://github.com/google/swiftshader
 RUN cd swiftshader && git checkout "${SWIFTSHADER_COMMIT?}" && cd ..
diff --git a/build_tools/docker/samples/Dockerfile b/build_tools/docker/samples/Dockerfile
index 403d5d3..303aa44 100644
--- a/build_tools/docker/samples/Dockerfile
+++ b/build_tools/docker/samples/Dockerfile
@@ -9,7 +9,7 @@
 #   * Vulkan (using SwiftShader)
 #   * Python (including `venv` and common pip packages needed for Colab)
 
-FROM gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e AS final
+FROM gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 AS final
 
 # Update setuptools per https://github.com/pypa/setuptools/issues/1694#issuecomment-466010982
 RUN apt-get update && apt-get install -y python3-venv python3.7-venv python-setuptools && \
diff --git a/build_tools/docker/swiftshader/Dockerfile b/build_tools/docker/swiftshader/Dockerfile
index f588924..9ffd4c2 100644
--- a/build_tools/docker/swiftshader/Dockerfile
+++ b/build_tools/docker/swiftshader/Dockerfile
@@ -9,7 +9,7 @@
 
 RUN apt-get update && apt-get install -y git
 
-ARG SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+ARG SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
 
 # zlib is needed for compiling SwiftShader.
 RUN apt-get update && apt-get install -y zlib1g-dev
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
index 8d13c4c..da102bd 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:26fbfb5e4813a2d118b68694398de950c808228757a79f2a8909dd332792d9d3 \
+  gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd \
   build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
index e53e033..728b3ef 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
@@ -24,7 +24,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e \
+  gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 \
   build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
index e53e033..728b3ef 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e \
+  gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 \
   build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/third_party/swiftshader/build_vk_swiftshader.sh b/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
index 9289e62..9e466c1 100755
--- a/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
+++ b/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
@@ -54,7 +54,7 @@
   SWIFTSHADER_INSTALL_DIR="${BASE_DIR?}"'\.swiftshader'
 fi
 
-SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
 SWIFTSHADER_DIR="$(mktemp --directory --tmpdir swiftshader_XXXXXX)"
 
 #  Clone swiftshader and checkout the appropriate commit.
diff --git a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 5d2ddb0..cb554f2 100644
--- a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -221,7 +221,41 @@
       IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize, workgroupSize);
 }
 
+/// Propagate the configuration annotated in the incoming IR.
+static LogicalResult setUserConfig(FuncOp entryPointFn, Operation *computeOp,
+                                   IREE::HAL::LoweringConfig config) {
+  IREE::HAL::DispatchLoweringPassPipeline passPipeline =
+      IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize;
+  if (auto setPassPipeline = getLoweringPassPipeline(config)) {
+    passPipeline = setPassPipeline.getValue();
+  }
+  SmallVector<int64_t, 4> workgroupSize;
+  if (auto workgroupSizeAttr = config.workgroupSize()) {
+    workgroupSize = llvm::to_vector<4>(
+        llvm::map_range(workgroupSizeAttr, [](Attribute intAttr) {
+          return intAttr.cast<IntegerAttr>().getInt();
+        }));
+  }
+  if (failed(setOpConfigAndEntryPointFnTranslation(
+          entryPointFn, computeOp, config, passPipeline, workgroupSize))) {
+    return failure();
+  }
+  // Reset the op configuration to drop the pass-pipeline and workgroup size
+  // info. The op does not carry that information anymore.
+  auto resetConfig = IREE::HAL::LoweringConfig::get(
+      config.tileSizes(), config.nativeVectorSize(),
+      /*passPipeline=*/nullptr,
+      /*workgroupSize=*/nullptr, computeOp->getContext());
+  setLoweringConfig(computeOp, resetConfig);
+  return success();
+}
+
 static LogicalResult setRootConfig(FuncOp entryPointFn, Operation *computeOp) {
+  if (IREE::HAL::LoweringConfig config = getLoweringConfig(computeOp)) {
+    // If the op already has a lowering config coming from the IR, use it and
+    // bypass the heuristic.
+    return setUserConfig(entryPointFn, computeOp, config);
+  }
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
     if (linalg::isaContractionOpInterface(linalgOp) &&
         linalgOp.getNumParallelLoops() >= 2) {
diff --git a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 3665903..07cd247 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -364,3 +364,65 @@
 //       CHECK: func @static_3d_fft_stage3()
 //       CHECK:   linalg_ext.fft
 //  CHECK-SAME:     lowering.config = #[[CONFIG]]
+
+// -----
+
+hal.executable @user_config {
+hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
+  hal.executable.entry_point public @_lowering_config_test_dispatch_1 attributes {interface = @io, ordinal = 0 : index}
+  builtin.module  {
+    func @_lowering_config_test_dispatch_1() {
+      %cst = constant 0.000000e+00 : f32
+      %c128 = constant 128 : index
+      %c1024 = constant 1024 : index
+      %c0 = constant 0 : index
+      %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:128x256xf32>
+      %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:256x1024xf32>
+      %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:128x1024xf32>
+      %workgroup_size_x = hal.interface.workgroup.size[0] : index
+      %workgroup_size_y = hal.interface.workgroup.size[1] : index
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %workgroup_id_y = hal.interface.workgroup.id[1] : index
+      %workgroup_count_y = hal.interface.workgroup.count[1] : index
+      %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+      %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+      scf.for %arg0 = %3 to %c128 step %4 {
+        %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+        %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+        scf.for %arg1 = %5 to %c1024 step %6 {
+          %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg0)[%workgroup_size_y]
+          %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:128x256xf32> -> tensor<?x256xf32>
+          %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg1)[%workgroup_size_x]
+          %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [256, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x1024xf32> -> tensor<256x?xf32>
+          %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg0)[%workgroup_size_y]
+          %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg1)[%workgroup_size_x]
+          %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg0)[%workgroup_size_y]
+          %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg1)[%workgroup_size_x]
+          %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+          %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+          %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {passPipeline = "LLVMGPUMatmulSimt", tileSizes = [[32, 256, 64], [], [4, 16]], workgroupSize = [16, 8, 1]}} ins(%8, %10 : tensor<?x256xf32>, tensor<256x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+          flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:128x1024xf32>
+        }
+      }
+      return
+    }
+    hal.interface private @io {
+      hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+      hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+      hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+    }
+  }
+}
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] = {{{.*}}tileSizes = {{\[}}[32, 256, 64], [], [4, 16]{{\]}}}
+//      CHECK: hal.executable.entry_point public @_lowering_config_test_dispatch_1
+// CHECK-SAME:     passPipeline = "LLVMGPUMatmulSimt"
+// CHECK-SAME:     workloadPerWorkgroup = [256, 32]
+// CHECK-SAME:     workgroup_size = [16 : index, 8 : index, 1 : index]
+//      CHECK: func @_lowering_config_test_dispatch_1
+//      CHECK:   linalg.fill
+// CHECK-SAME:       lowering.config = #[[CONFIG]]
+//      CHECK:   linalg.matmul
+// CHECK-SAME:       lowering.config = #[[CONFIG]]
diff --git a/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp b/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
index 37fac74..38240a8 100644
--- a/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
@@ -13,9 +13,6 @@
 #include <array>
 
 #include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "llvm/Support/MathExtras.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 
 namespace mlir {
@@ -23,100 +20,6 @@
 namespace detail {
 
 //===----------------------------------------------------------------------===//
-// Matmul
-//===----------------------------------------------------------------------===//
-
-static LogicalResult setOpConfig(linalg::LinalgOp op) {
-  ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
-  ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
-  if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
-  if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success();
-
-  bool isBM = isa<linalg::BatchMatmulOp>(op);
-
-  int64_t dimM = lhsShape[0 + isBM];
-  int64_t dimK = lhsShape[1 + isBM];
-  int64_t dimN = rhsShape[1 + isBM];
-
-  // The core idea is to distribute the matmul M/N dimension to the workgroup
-  // Y/X dimension, with each thread in a workgroup handling multiple vector
-  // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K)
-  // and try different configurations by scaling them down until we find a
-  // configuration that can perfectly tile the input matmul.
-
-  const int64_t bestX = 32, bestY = 2;
-  const int64_t bestThreadM = 8, bestThreadN = 8, bestThreadK = 4;
-
-  int64_t residualThreads = bestX * bestY;
-  int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN;
-
-  SmallVector<int64_t, 3> workgroupSize(3, 1);               // (X, Y, Z)
-  SmallVector<int64_t, 4> workgroupTileSizes(3 + isBM, 0);   // (B, M, N, K)
-  SmallVector<int64_t, 4> invocationTileSizes(3 + isBM, 0);  // (B, M, N, K)
-
-  if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1;
-
-  // Deduce the configuration for the N dimension. Start with the best workgroup
-  // X size, and reduce by a factor of two each time.
-  for (int64_t x = bestX; x >= 2; x >>= 1) {
-    // Handle 4 elements per thread for the innermost dimension. We need this
-    // for vectorized load.
-    int64_t chosenTileSize = 4;
-    if (dimN % (x * chosenTileSize) == 0) {
-      workgroupSize[0] = x;
-      workgroupTileSizes[1 + isBM] = x * chosenTileSize;
-      invocationTileSizes[1 + isBM] = chosenTileSize;
-      residualThreads /= x;
-      assert(residualTilingFactor % chosenTileSize == 0);
-      residualTilingFactor /= chosenTileSize;
-      break;
-    }
-  }
-  if (workgroupTileSizes[1 + isBM] == 0) return success();
-
-  // Deduce the configuration for the M dimension. Start with the best workgroup
-  // Y size, and reduce by a factor of two each time.
-  for (int64_t y = residualThreads; y >= 1; y >>= 1) {
-    int64_t chosenTileSize = 0;
-    // Reduce the thread tiling size by one each time. We read one row each
-    // time; so it's fine to not be some power of two here.
-    for (int64_t t = bestThreadM; t >= 1; --t) {
-      if (dimM % (y * t) == 0) {
-        chosenTileSize = t;
-        break;
-      }
-    }
-    if (chosenTileSize) {
-      workgroupSize[1] = y;
-      workgroupTileSizes[0 + isBM] = y * chosenTileSize;
-      invocationTileSizes[0 + isBM] = chosenTileSize;
-      assert(residualTilingFactor > chosenTileSize);
-      residualTilingFactor -= chosenTileSize;
-      break;
-    }
-  }
-  if (workgroupTileSizes[0 + isBM] == 0) return success();
-
-  // Deduce the configuration for the K dimension. We need some power of two
-  // here so that we can do vector load.
-  for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
-    if (dimK % t == 0) {
-      workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
-      break;
-    }
-  }
-
-  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
-  TileSizesListType tileSizes;
-  tileSizes.push_back(workgroupTileSizes);
-  tileSizes.emplace_back();
-  tileSizes.push_back(invocationTileSizes);
-  return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
-                                               op, tileSizes, {}, pipeline,
-                                               workgroupSize);
-}
-
-//===----------------------------------------------------------------------===//
 // Entry Point
 //===----------------------------------------------------------------------===//
 
@@ -124,8 +27,11 @@
                                      Operation *rootOp) {
   int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
   return TypeSwitch<Operation *, LogicalResult>(rootOp)
-      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
-          [](auto op) { return setOpConfig(op); })
+      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([](auto op) {
+        std::array<int64_t, 2> workgroupXY = {32, 2};
+        std::array<int64_t, 3> threadMNK = {16, 4, 4};
+        return setMatmulOpConfig(op, workgroupXY, threadMNK);
+      })
       .Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
         return setConvOpConfig(op, subgroupSize,
                                /*bestTilingFactor=*/32);
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index d8cd080..fa57109 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -29,21 +29,6 @@
 // Utilities
 //===----------------------------------------------------------------------===//
 
-/// Given `nprocs`, tries to distribute it evenly across 2 logical dimensions.
-static std::tuple<int64_t, int64_t> distributeProcs2D(int64_t nprocs) {
-  int64_t nprocs_x = std::max<int64_t>(
-      1, static_cast<int64_t>(
-             llvm::PowerOf2Ceil(static_cast<uint64_t>(std::sqrt(nprocs)))));
-  return std::make_tuple(nprocs_x, nprocs / nprocs_x);
-}
-
-/// Returns the minimum of `shape` and `tileSize` if shape is static.
-/// Returns `tileSize` otherwise.
-int64_t getMinIfStaticShape(int64_t shape, int64_t tileSize) {
-  if (shape == ShapedType::kDynamicSize) return tileSize;
-  return std::min(shape, tileSize);
-}
-
 /// Defines the workgroup count region on entry point ops for the
 /// `SPIRVDistributeToGlobalID` pipeline.
 // TODO(ravishankarm): Remove this when that pipeline is deprecated.
@@ -218,79 +203,108 @@
 // Matmul Default Configuration
 //===----------------------------------------------------------------------===//
 
-static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
-                                 linalg::BatchMatmulOp op) {
-  unsigned maxWorkgroupSize =
-      limits.max_compute_workgroup_invocations().getInt();
+namespace detail {
 
-  // This is just being hard-wired for now to be minimal viable, but this can be
-  // decided better when we have better estimates of device charecteristics.
-  const int64_t numRowsPerThread = 1;
-  const int64_t numColsPerThread = 1;
-  const int64_t numBatchesPerThread = 1;
-  const int64_t tileSizeK = 0;
-
-  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
-  std::tie(workgroupSize[0], workgroupSize[1]) =
-      distributeProcs2D(maxWorkgroupSize);
-
-  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
-
-  TileSizesListType tileSizes;
-  // Workgroup level.
-  tileSizes.push_back({numBatchesPerThread, numRowsPerThread * workgroupSize[1],
-                       numColsPerThread * workgroupSize[0], tileSizeK});
-  // No tiling at the subgroup level since this target doesn't use subgroup op
-  // or shared memory.
-  tileSizes.emplace_back();
-  // Invocation level.
-  tileSizes.push_back(
-      {numBatchesPerThread, numRowsPerThread, numColsPerThread, 0});
-
-  return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
-                                               op, tileSizes, {}, pipeline,
-                                               workgroupSize);
-}
-
-static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
-                                 linalg::MatmulOp op) {
-  unsigned maxWorkgroupSize =
-      limits.max_compute_workgroup_invocations().getInt();
-
-  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
-  std::tie(workgroupSize[0], workgroupSize[1]) =
-      distributeProcs2D(maxWorkgroupSize);
-
-  const int numRowsPerThread = 1;
-  const int numColsPerThread = 1;
-  int64_t tileSizeK = 0;
-
+LogicalResult setMatmulOpConfig(linalg::LinalgOp op,
+                                std::array<int64_t, 2> bestWorkgroupSizeXY,
+                                std::array<int64_t, 3> bestThreadTileSizeMNK) {
   ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
   ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
+  if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
+  if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success();
 
-  int64_t M = lhsShape[0];
-  int64_t N = rhsShape[1];
-  int64_t K = lhsShape[1];
+  bool isBM = isa<linalg::BatchMatmulOp>(op);
 
-  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
+  int64_t dimM = lhsShape[0 + isBM];
+  int64_t dimK = lhsShape[1 + isBM];
+  int64_t dimN = rhsShape[1 + isBM];
 
+  // The core idea is to distribute the matmul M/N dimension to the workgroup
+  // Y/X dimension, with each thread in a workgroup handling multiple vector
+  // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K)
+  // and try different configurations by scaling them down until we find a
+  // configuration that can perfectly tile the input matmul.
+
+  const int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1];
+  const int64_t bestThreadM = bestThreadTileSizeMNK[0],
+                bestThreadN = bestThreadTileSizeMNK[1],
+                bestThreadK = bestThreadTileSizeMNK[2];
+
+  int64_t residualThreads = bestX * bestY;
+  int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN;
+
+  SmallVector<int64_t, 3> workgroupSize(3, 1);               // (X, Y, Z)
+  SmallVector<int64_t, 4> workgroupTileSizes(3 + isBM, 0);   // (B, M, N, K)
+  SmallVector<int64_t, 4> invocationTileSizes(3 + isBM, 0);  // (B, M, N, K)
+
+  if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1;
+
+  // Deduce the configuration for the N dimension. Start with the best workgroup
+  // X size, and reduce by a factor of two each time.
+  for (int64_t x = bestX; x >= 2; x >>= 1) {
+    // Handle `bestThreadN` elements per thread for the innermost dimension.
+    // We need this for vectorized loads.
+    int64_t chosenTileSize = bestThreadN;
+    if (dimN % (x * chosenTileSize) == 0) {
+      workgroupSize[0] = x;
+      workgroupTileSizes[1 + isBM] = x * chosenTileSize;
+      invocationTileSizes[1 + isBM] = chosenTileSize;
+      residualThreads /= x;
+      assert(residualTilingFactor % chosenTileSize == 0);
+      residualTilingFactor /= chosenTileSize;
+      break;
+    }
+  }
+  if (workgroupTileSizes[1 + isBM] == 0) return success();
+
+  // Deduce the configuration for the M dimension. Start with the best workgroup
+  // Y size, and reduce by a factor of two each time.
+  for (int64_t y = residualThreads; y >= 1; y >>= 1) {
+    int64_t chosenTileSize = 0;
+    // Reduce the thread tile size by one each time. We read one row at a
+    // time, so this need not be a power of two.
+    for (int64_t t = bestThreadM; t >= 1; --t) {
+      if (dimM % (y * t) == 0) {
+        chosenTileSize = t;
+        break;
+      }
+    }
+    if (chosenTileSize) {
+      workgroupSize[1] = y;
+      workgroupTileSizes[0 + isBM] = y * chosenTileSize;
+      invocationTileSizes[0 + isBM] = chosenTileSize;
+      assert(residualTilingFactor > chosenTileSize);
+      residualTilingFactor -= chosenTileSize;
+      break;
+    }
+  }
+  if (workgroupTileSizes[0 + isBM] == 0) return success();
+
+  // Deduce the configuration for the K dimension. We need some power of two
+  // here so that we can do vector load.
+  for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
+    if (dimK % t == 0) {
+      workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
+      break;
+    }
+  }
+
+  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
   TileSizesListType tileSizes;
-  // Workgroup level.
-  tileSizes.push_back(
-      {getMinIfStaticShape(M, numRowsPerThread * workgroupSize[1]),
-       getMinIfStaticShape(N, numColsPerThread * workgroupSize[0]),
-       getMinIfStaticShape(K, tileSizeK)});
-  // No tiling at the subgroup level since this target doesn't use subgroup op
-  // or shared memory.
+  tileSizes.push_back(workgroupTileSizes);
   tileSizes.emplace_back();
-  // Invocation level.
-  tileSizes.push_back({1, 1, 0});
-
+  tileSizes.push_back(invocationTileSizes);
   return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
                                                op, tileSizes, {}, pipeline,
                                                workgroupSize);
 }
 
+}  // namespace detail
+
+//===----------------------------------------------------------------------===//
+// FFT Default Configuration
+//===----------------------------------------------------------------------===//
+
 static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
                                  linalg_ext::FftOp op) {
   const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue();
@@ -441,10 +455,22 @@
   // Check whether there is actually a configuration found. If so, it's done.
   if (getLoweringConfig(rootOp)) return result;
 
-  // Otherwise fallback to use a default configuration.
+  // Otherwise fall back to a default configuration that tiles and
+  // distributes/vectorizes.
   spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
   return TypeSwitch<Operation *, LogicalResult>(rootOp)
-      .Case<linalg::BatchMatmulOp, linalg::MatmulOp, linalg_ext::FftOp>(
+      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) {
+        // Try to tile and vectorize first.
+        std::array<int64_t, 2> workgroupXY = {32, 2};
+        std::array<int64_t, 3> threadMNK = {8, 8, 4};
+        auto result = detail::setMatmulOpConfig(op, workgroupXY, threadMNK);
+        if (failed(result)) return result;
+        if (getLoweringConfig(op)) return result;
+
+        // If unsuccessful, try to tile and distribute.
+        return setDefaultOpConfig(limits, op);
+      })
+      .Case<linalg_ext::FftOp>(
           [limits](auto op) { return setOpConfig(limits, op); })
       .Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwOp>(
           [limits](auto op) {
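Note on setMatmulOpConfig above: to make the deduction concrete, here is a small Python model of the non-batch path (a sketch for illustration only, not the implementation; the batch-matmul path additionally prepends a batch tile of 1 to both tile-size lists). Fed the vendor hints that appear elsewhere in this patch, it reproduces the updated FileCheck expectations in config_adreno_matmul.mlir and the new config_mali_matmul.mlir:

    def set_matmul_op_config(dim_m, dim_n, dim_k, best_workgroup_xy, best_thread_mnk):
        """Models the tile-size deduction for a static, non-batch matmul.

        Returns (workgroup_tiles, invocation_tiles, workgroup_size), with the
        tile lists ordered (M, N, K) and the workgroup size ordered (X, Y, Z),
        or None when no perfect tiling is found.
        """
        best_x, best_y = best_workgroup_xy
        best_m, best_n, best_k = best_thread_mnk
        residual_threads = best_x * best_y
        residual_factor = (best_m + best_k) * best_n
        workgroup_size = [1, 1, 1]
        workgroup_tiles = [0, 0, 0]
        invocation_tiles = [0, 0, 0]

        # N dimension: start from the best workgroup X size and halve until N
        # tiles perfectly; each thread covers best_n elements for vector loads.
        x = best_x
        while x >= 2:
            if dim_n % (x * best_n) == 0:
                workgroup_size[0] = x
                workgroup_tiles[1], invocation_tiles[1] = x * best_n, best_n
                residual_threads //= x
                residual_factor //= best_n
                break
            x //= 2
        if workgroup_tiles[1] == 0:
            return None

        # M dimension: give the remaining threads to Y; the per-thread tile may
        # step down by one since row counts need not be powers of two.
        y = residual_threads
        while y >= 1:
            chosen = next((t for t in range(best_m, 0, -1) if dim_m % (y * t) == 0), 0)
            if chosen:
                workgroup_size[1] = y
                workgroup_tiles[0], invocation_tiles[0] = y * chosen, chosen
                residual_factor -= chosen
                break
            y //= 2
        if workgroup_tiles[0] == 0:
            return None

        # K dimension: the largest power of two <= residual_factor that divides
        # K, so that vector loads remain possible (the C++ asserts the factor
        # stays positive).
        t = 1 << (residual_factor.bit_length() - 1)
        while t >= 1:
            if dim_k % t == 0:
                workgroup_tiles[2] = invocation_tiles[2] = t
                break
            t //= 2
        return workgroup_tiles, invocation_tiles, workgroup_size

    # Adreno hints ({32, 2}, {16, 4, 4}) from AdrenoConfig.cpp in this patch:
    print(set_matmul_op_config(1024, 2048, 512, (32, 2), (16, 4, 4)))
    # -> ([32, 128, 4], [16, 4, 4], [32, 2, 1])
    # Mali f32 hints ({8, 2}, {6, 4, 4}) from MaliConfig.cpp in this patch:
    print(set_matmul_op_config(1024, 2048, 512, (8, 2), (6, 4, 4)))
    # -> ([8, 32, 4], [4, 4, 4], [8, 2, 1])

Both results agree with the CHECK lines in this diff: lowering.config tileSizes [[32, 128, 4], [], [16, 4, 4]] with workgroup_size [32, 2, 1] for Adreno, and [[8, 32, 4], [], [4, 4, 4]] with workgroup_size [8, 2, 1] for Mali.
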
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.h b/iree/compiler/Codegen/SPIRV/KernelConfig.h
index 51838ee..c0d4f31 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.h
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.h
@@ -33,6 +33,12 @@
                               const int64_t subgroupSize,
                               const int64_t bestTilingFactor);
 
+/// Sets the CodeGen configuration (via attributes) on the given matmul
+/// `linalgOp` using the given best workgroup size and thread tile size hints.
+LogicalResult setMatmulOpConfig(linalg::LinalgOp linalgOp,
+                                std::array<int64_t, 2> bestWorkgroupSizeXY,
+                                std::array<int64_t, 3> bestThreadTileSizeMNK);
+
 /// Sets CodeGen configuration for GPUs from a specific vendor.
 ///
 /// If the given `rootOp` has known good CodeGen configuration, attaches a
diff --git a/iree/compiler/Codegen/SPIRV/MaliConfig.cpp b/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
index 4f01291..1c19955 100644
--- a/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
@@ -13,166 +13,12 @@
 #include <array>
 
 #include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 
 namespace mlir {
 namespace iree_compiler {
 namespace detail {
 
-struct TileWorkgroupSizePair {
-  // How many scalar elements each workgroup should handle along each dimension.
-  std::array<int64_t, 3> tileSize;
-  // The number of threads per workgroup along each dimension.
-  std::array<int64_t, 3> workgroupSize;
-};
-
-//===----------------------------------------------------------------------===//
-// Matmul
-//===----------------------------------------------------------------------===//
-
-/// Writes preferred matmul workgroup tile sizes and workgroup size into
-/// `pairs` for the given matmul `scale` (MxNxK) and `elementType`.
-static void getMatmulTileAndWorkgroupSizes(
-    int64_t scale, Type elementType,
-    SmallVectorImpl<TileWorkgroupSizePair> &pairs) {
-  if (elementType.isF16()) {
-    const int64_t smallMatrixSizeThreshold = 512 * 512;
-    // For smaller destination size we cannot fill out the GPU with bigger tile
-    // sizes. Instead we pick smaller tiles along M and N to increase the number
-    // of workgroups and a larger K tile size since we have lower pressure and
-    // need extra instructions to hide latency.
-    // TODO: The threshold needs to be fine tuned by doing exploration based on
-    // matrix shapes.
-    if (scale <= smallMatrixSizeThreshold) {
-      pairs.push_back(TileWorkgroupSizePair({{16, 32, 16}, {8, 2, 1}}));
-    } else {
-      pairs.push_back(TileWorkgroupSizePair({{16, 64, 4}, {8, 2, 1}}));
-      pairs.push_back(TileWorkgroupSizePair({{8, 128, 4}, {8, 2, 1}}));
-      pairs.push_back(TileWorkgroupSizePair({{16, 32, 4}, {8, 2, 1}}));
-    }
-    return;
-  }
-
-  // TODO: Heuristic picked based on MobileNet performance. We need
-  // auto-tuning to be able to make a smarter choice.
-  const int64_t smallMatrixSizeThreshold = 20000;
-
-  if (scale <= smallMatrixSizeThreshold) {
-    pairs.push_back(TileWorkgroupSizePair({{4, 32, 16}, {8, 2, 1}}));
-  }
-  pairs.push_back(TileWorkgroupSizePair({{12, 32, 4}, {8, 2, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{14, 32, 4}, {8, 2, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{10, 32, 4}, {8, 2, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{7, 64, 4}, {16, 1, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{8, 32, 4}, {8, 2, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{6, 32, 4}, {8, 2, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{24, 16, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{16, 16, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{24, 8, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{40, 8, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{32, 8, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{16, 8, 4}, {2, 8, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{1, 32, 16}, {8, 1, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{1, 32, 8}, {8, 1, 1}}));
-  pairs.push_back(TileWorkgroupSizePair({{1, 32, 4}, {8, 1, 1}}));
-}
-
-/// Launch configuration for Mali GPU configuration.
-static LogicalResult setOpConfig(linalg::BatchMatmulOp op) {
-  ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
-  ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
-
-  if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
-      llvm::any_of(rhsShape, ShapedType::isDynamic)) {
-    return success();
-  }
-
-  // Get a vector of best tile size ordered from best to worst.
-  Type elementType =
-      op.inputs()[0].getType().cast<ShapedType>().getElementType();
-  int64_t matmulScale = lhsShape[0] * lhsShape[1] * rhsShape[2];
-  SmallVector<TileWorkgroupSizePair, 4> pairs;
-  getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
-
-  for (TileWorkgroupSizePair pair : pairs) {
-    if (lhsShape[1] % pair.tileSize[0] != 0 ||
-        rhsShape[2] % pair.tileSize[1] != 0 ||
-        lhsShape[2] % pair.tileSize[2] != 0) {
-      continue;
-    }
-
-    auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
-
-    SmallVector<int64_t, 4> numElementsPerWorkgroup;
-    numElementsPerWorkgroup = {1, pair.tileSize[0], pair.tileSize[1],
-                               pair.tileSize[2]};
-
-    TileSizesListType tileSizes;
-    // Workgroup level.
-    tileSizes.push_back(numElementsPerWorkgroup);
-    // No tiling at the subgroup level since this target doesn't use subgroup op
-    // or shared memory.
-    tileSizes.emplace_back();
-    // Invocation level.
-    tileSizes.push_back({numElementsPerWorkgroup[0],
-                         numElementsPerWorkgroup[1] / pair.workgroupSize[1],
-                         numElementsPerWorkgroup[2] / pair.workgroupSize[0],
-                         numElementsPerWorkgroup[3]});
-
-    return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
-                                                 op, tileSizes, {}, pipeline,
-                                                 pair.workgroupSize);
-  }
-  return success();
-}
-
-static LogicalResult setOpConfig(linalg::MatmulOp op) {
-  ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
-  ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
-
-  if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
-      llvm::any_of(rhsShape, ShapedType::isDynamic)) {
-    return success();
-  }
-
-  Type elementType =
-      op.inputs()[0].getType().cast<ShapedType>().getElementType();
-  int64_t matmulScale = lhsShape[0] * rhsShape[1];
-  SmallVector<TileWorkgroupSizePair, 4> pairs;
-  getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
-
-  for (TileWorkgroupSizePair pair : pairs) {
-    if (lhsShape[0] % pair.tileSize[0] != 0 ||
-        rhsShape[1] % pair.tileSize[1] != 0 ||
-        lhsShape[1] % pair.tileSize[2] != 0) {
-      continue;
-    }
-
-    auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
-
-    SmallVector<int64_t, 4> numElementsPerWorkgroup(pair.tileSize.begin(),
-                                                    pair.tileSize.end());
-
-    TileSizesListType tileSizes;
-    // Workgroup level.
-    tileSizes.push_back(numElementsPerWorkgroup);
-    // No tiling at the subgroup level since this target doesn't use subgroup op
-    // or shared memory.
-    tileSizes.emplace_back();
-    // Invocation level.
-    tileSizes.push_back({numElementsPerWorkgroup[0] / pair.workgroupSize[1],
-                         numElementsPerWorkgroup[1] / pair.workgroupSize[0],
-                         numElementsPerWorkgroup[2]});
-
-    return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
-                                                 op, tileSizes, {}, pipeline,
-                                                 pair.workgroupSize);
-  }
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // Entry Point
 //===----------------------------------------------------------------------===//
@@ -181,8 +27,17 @@
                                    Operation *rootOp) {
   int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
   return TypeSwitch<Operation *, LogicalResult>(rootOp)
-      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
-          [](auto op) { return setOpConfig(op); })
+      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([](auto op) {
+        std::array<int64_t, 2> workgroupXY = {8, 2};
+        std::array<int64_t, 3> threadMNK;
+        auto inputType = op.inputs()[0].getType().template cast<ShapedType>();
+        if (inputType.getElementType().isF16()) {
+          threadMNK = {8, 8, 4};
+        } else {
+          threadMNK = {6, 4, 4};
+        }
+        return setMatmulOpConfig(op, workgroupXY, threadMNK);
+      })
       .Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
         return setConvOpConfig(op, subgroupSize,
                                /*bestTilingFactor=*/16);
diff --git a/iree/compiler/Codegen/SPIRV/test/BUILD b/iree/compiler/Codegen/SPIRV/test/BUILD
index 440ed22..5f847d4 100644
--- a/iree/compiler/Codegen/SPIRV/test/BUILD
+++ b/iree/compiler/Codegen/SPIRV/test/BUILD
@@ -24,6 +24,7 @@
             "config_linalg_ext_ops.mlir",
             "config_linalg_ops.mlir",
             "config_mali_conv.mlir",
+            "config_mali_matmul.mlir",
             "convert_to_spirv.mlir",
             "distribute_to_global_id.mlir",
             "fold_gpu_procid_uses.mlir",
diff --git a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index 1495691..0d805b1 100644
--- a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -19,6 +19,7 @@
     "config_linalg_ext_ops.mlir"
     "config_linalg_ops.mlir"
     "config_mali_conv.mlir"
+    "config_mali_matmul.mlir"
     "convert_to_spirv.mlir"
     "distribute_to_global_id.mlir"
     "fold_gpu_procid_uses.mlir"
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index 9027f61..015e4d1 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -63,17 +63,17 @@
 }
 
 //          CHECK-LABEL: hal.executable.entry_point public @matmul_1024x2048x512
-//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 16]}
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 32]}
 //           CHECK-SAME:   workgroup_size = [32 : index, 2 : index, 1 : index]
 //           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
 //           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
 //           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[X]]]
-//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%[[Y]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
 //           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
 
 //                CHECK: func @matmul_1024x2048x512()
 //                CHECK:   linalg.matmul
-//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[16, 128, 16], [], [8, 4, 16]]}
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[32, 128, 4], [], [16, 4, 4]]}
 
 // -----
 
@@ -140,17 +140,17 @@
 }
 
 //          CHECK-LABEL: hal.executable.entry_point public @matmul_3136x24x96
-//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 224]}
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 448]}
 //           CHECK-SAME:   workgroup_size = [2 : index, 32 : index, 1 : index]
 //           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
 //           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
 //           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
-//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 224)>()[%[[Y]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 448)>()[%[[Y]]]
 //           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
 
 //                CHECK: func @matmul_3136x24x96()
 //                CHECK:   linalg.matmul
-//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[224, 8, 16], [], [7, 4, 16]]}
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[448, 8, 4], [], [14, 4, 4]]}
 
 // -----
 
@@ -227,7 +227,7 @@
 
 //                CHECK: func @matmul_196x64x192()
 //                CHECK:   linalg.matmul
-//  CHECK-SAME{LITERAL}:      lowering.config = {tileSizes = [[28, 64, 16], [], [7, 4, 16]]}
+//  CHECK-SAME{LITERAL}:      lowering.config = {tileSizes = [[28, 64, 8], [], [7, 4, 8]]}
 
 // -----
 
@@ -289,17 +289,17 @@
 }
 
 //          CHECK-LABEL: hal.executable.entry_point public @matmul_12544x96x16
-//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 64]}
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 128]}
 //           CHECK-SAME:   workgroup_size = [8 : index, 8 : index, 1 : index]
 //           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
 //           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
 //           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
-//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%[[Y]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[Y]]]
 //           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
 
 //                CHECK: func @matmul_12544x96x16()
 //                CHECK:   linalg.matmul
-//  CHECK-SAME{LITERAL}:     lowering.config =  {tileSizes = [[64, 32, 16], [], [8, 4, 16]]}
+//  CHECK-SAME{LITERAL}:     lowering.config =  {tileSizes = [[128, 32, 4], [], [16, 4, 4]]}
 
 // -----
 
@@ -376,7 +376,7 @@
 
 //                CHECK: func @matmul_49x160x576()
 //                CHECK:   linalg.matmul
-//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[7, 32, 16], [], [7, 4, 16]]}
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[7, 32, 8], [], [7, 4, 8]]}
 
 // -----
 
@@ -454,16 +454,16 @@
 }
 
 //          CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x384x384
-//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 16, 1]}
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 32, 1]}
 //           CHECK-SAME:   workgroup_size = [32 : index, 2 : index, 1 : index]
 //           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
 //           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[X]]]
-//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%[[Y]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
 //           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
 
 //                CHECK: func @batch_matmul_4x384x384()
 //                CHECK:   linalg.batch_matmul
-//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 16, 128, 16], [], [1, 8, 4, 16]]}
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 32, 128, 4], [], [1, 16, 4, 4]]}
 
 // -----
 
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
new file mode 100644
index 0000000..9a04e8f
--- /dev/null
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -0,0 +1,553 @@
+// RUN: iree-opt -split-input-file -mlir-print-local-scope -pass-pipeline='hal.executable(hal.executable.variant(iree-spirv-lower-executable-target-pass{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+// Large matmul that can match the best tiling scheme.
+
+hal.executable @matmul_1024x2048x512 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @matmul_1024x2048x512 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_1024x2048x512() {
+        %c0 = constant 0 : index
+        %c2048 = constant 2048 : index
+        %c1024 = constant 1024 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:512x2048xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1024x2048xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c1024 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c2048 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<?x512xf32>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x?xf32>
+            %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg1)[%workgroup_size_x]
+            %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg0)[%workgroup_size_y]
+            %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 2048, s0)>(%arg1)[%workgroup_size_x]
+            %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+            %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1024x2048xf32>
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_1024x2048x512
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8]}
+//           CHECK-SAME:   workgroup_size = [8 : index, 2 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+//                CHECK: func @matmul_1024x2048x512()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[8, 32, 4], [], [4, 4, 4]]}
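+
+// Sanity check (informal): the workgroup tile is 8x32 (MxN) with K stepped by
+// 4, and the 8x2 workgroup gives each thread a 4x4 vector tile (2 * 4 = 8
+// rows, 8 * 4 = 32 columns), so workloadPerWorkgroup = [32, 8] and the
+// workgroup counts are computed with ceildiv 32 / ceildiv 8 along x / y.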
+
+// -----
+
+// Matmul with a small N that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_3136x24x96 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @matmul_3136x24x96 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_3136x24x96() {
+        %c0 = constant 0 : index
+        %c24 = constant 24 : index
+        %c3136 = constant 3136 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:3136x96xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:96x24xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:3136x24xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c3136 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c24 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3136)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x96xf32> -> tensor<?x96xf32>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 24)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [96, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x?xf32>
+            %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3136)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 24)>(%arg1)[%workgroup_size_x]
+            %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 3136, s0)>(%arg0)[%workgroup_size_y]
+            %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 24, s0)>(%arg1)[%workgroup_size_x]
+            %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+            %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x96xf32>, tensor<96x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x24xf32>
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_3136x24x96
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 32]}
+//           CHECK-SAME:   workgroup_size = [2 : index, 8 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+//                CHECK: func @matmul_3136x24x96()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[32, 8, 4], [], [4, 4, 4]]}
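+
+// Sanity check (informal): with N = 24 the N tile shrinks to 8 (2 threads *
+// vector width 4) while M takes the larger 32 tile (8 threads * 4), so all
+// 2 * 8 = 16 threads -- one full subgroup -- stay busy.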
+
+// -----
+
+// Matmul with a small M that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_196x64x192 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @matmul_196x64x192 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_196x64x192() {
+        %c0 = constant 0 : index
+        %c64 = constant 64 : index
+        %c196 = constant 196 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:196x192xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:192x64xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:196x64xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c196 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c64 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 196)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:196x192xf32> -> tensor<?x192xf32>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [192, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x?xf32>
+            %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 196)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg1)[%workgroup_size_x]
+            %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 196, s0)>(%arg0)[%workgroup_size_y]
+            %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 64, s0)>(%arg1)[%workgroup_size_x]
+            %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+            %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x192xf32>, tensor<192x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:196x64xf32>
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_196x64x192
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 4]}
+//           CHECK-SAME:   workgroup_size = [8 : index, 2 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+//                CHECK: func @matmul_196x64x192()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[4, 32, 8], [], [2, 4, 8]]}
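+
+// Sanity check (informal): M = 196 only splits evenly into small tiles, so
+// the M tile drops to 4 (2 threads * 2 rows each) while N keeps a 32 tile
+// (8 threads * 4 columns), and K steps by 8 instead of 4.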
+
+// -----
+
+// Matmul with a small K that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_12544x96x16 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @matmul_12544x96x16 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_12544x96x16() {
+        %c0 = constant 0 : index
+        %c96 = constant 96 : index
+        %c12544 = constant 12544 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<12544x16xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<16x96xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<12544x96xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c12544 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c96 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 12544)>(%arg0)[%workgroup_size_y]
+            %8 = memref.subview %0[%arg0, 0] [%7, 16] [1, 1] : memref<12544x16xf32> to memref<?x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 96)>(%arg1)[%workgroup_size_x]
+            %10 = memref.subview %1[0, %arg1] [16, %9] [1, 1] : memref<16x96xf32> to memref<16x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+            %11 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<12544x96xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+            linalg.fill(%cst, %11) : f32, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+            linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : memref<?x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>>, memref<16x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>) outs(%11 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>)
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_12544x96x16
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8]}
+//           CHECK-SAME:   workgroup_size = [8 : index, 2 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+//                CHECK: func @matmul_12544x96x16()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[8, 32, 4], [], [4, 4, 4]]}
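+
+// Sanity check (informal): K = 16 is still a multiple of the K tile size 4,
+// so the configuration matches the large-matmul case above; only the number
+// of K iterations shrinks (16 / 4 = 4).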
+
+// -----
+
+// Matmul with an odd M and a small N that cannot utilize all threads in a workgroup.
+
+hal.executable @matmul_49x160x576 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @matmul_49x160x576 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_49x160x576() {
+        %c0 = constant 0 : index
+        %c160 = constant 160 : index
+        %c49 = constant 49 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:49x576xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:576x160xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:49x160xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c49 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c160 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 49)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:49x576xf32> -> tensor<?x576xf32>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 160)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [576, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x?xf32>
+            %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 49)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 160)>(%arg1)[%workgroup_size_x]
+            %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 49, s0)>(%arg0)[%workgroup_size_y]
+            %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 160, s0)>(%arg1)[%workgroup_size_x]
+            %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+            %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+            %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x576xf32>, tensor<576x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:49x160xf32>
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_49x160x576
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 1]}
+//           CHECK-SAME:   workgroup_size = [8 : index, 1 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y]], %[[ONE]]
+
+//                CHECK: func @matmul_49x160x576()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 32, 8], [], [1, 4, 8]]}
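+
+// Sanity check (informal): M = 49 is odd, so each workgroup covers a single
+// row (M tile 1) and only the 8 x-threads are used (8 * 4 = 32 columns);
+// with a y workload of 1 per workgroup, the y count is returned unchanged.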
+
+// -----
+
+// Large batch matmul.
+
+hal.executable @batch_matmul_4x384x384 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @batch_matmul_4x384x384 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @batch_matmul_4x384x384() {
+        %c0 = constant 0 : index
+        %c384 = constant 384 : index
+        %c4 = constant 4 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x384x32xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x384xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x384x384xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_size_z = hal.interface.workgroup.size[2] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %workgroup_id_z = hal.interface.workgroup.id[2] : index
+        %workgroup_count_z = hal.interface.workgroup.count[2] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+        scf.for %arg0 = %3 to %c4 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+          scf.for %arg1 = %5 to %c384 step %6 {
+            %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+            %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+            scf.for %arg2 = %7 to %c384 step %8 {
+              %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg1)[%workgroup_size_y]
+              %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x384x32xf32> -> tensor<?x?x32xf32>
+              %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg2)[%workgroup_size_x]
+              %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 32, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<?x32x?xf32>
+              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg1)[%workgroup_size_y]
+              %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg2)[%workgroup_size_x]
+              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 4, s0)>(%arg0)[%workgroup_size_z]
+              %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg1)[%workgroup_size_y]
+              %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg2)[%workgroup_size_x]
+              %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+              %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+              %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x32xf32>, tensor<?x32x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+              flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x384x384xf32>
+            }
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x384x384
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 12, 1]}
+//           CHECK-SAME:   workgroup_size = [8 : index, 2 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 12)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
+
+//                CHECK: func @batch_matmul_4x384x384()
+//                CHECK:   linalg.batch_matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 12, 32, 4], [], [1, 6, 4, 4]]}
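+
+// Sanity check (informal): each workgroup covers one batch and a 12x32 output
+// tile; the 8x2 workgroup gives each thread a 6x4 vector tile (2 * 6 = 12
+// rows, 8 * 4 = 32 columns), and the batch count is returned unchanged since
+// the z workload per workgroup is 1.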
+
+// -----
+
+// Small batch matmul.
+
+hal.executable @batch_matmul_4x2x8 {
+  hal.interface @io {
+    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+        max_compute_shared_memory_size = 32768 : i32,
+        max_compute_workgroup_invocations = 512 : i32,
+        max_compute_workgroup_size = dense<512> : vector<3xi32>,
+        subgroup_size = 16 : i32}>
+    }> {
+    hal.executable.entry_point @batch_matmul_4x2x8 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @batch_matmul_4x2x8() {
+        %c0 = constant 0 : index
+        %c8 = constant 8 : index
+        %c2 = constant 2 : index
+        %c4 = constant 4 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x2x32xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x8xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x2x8xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_size_z = hal.interface.workgroup.size[2] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %workgroup_id_z = hal.interface.workgroup.id[2] : index
+        %workgroup_count_z = hal.interface.workgroup.count[2] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+        scf.for %arg0 = %3 to %c4 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+          scf.for %arg1 = %5 to %c2 step %6 {
+            %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+            %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+            scf.for %arg2 = %7 to %c8 step %8 {
+              %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2)>(%arg1)[%workgroup_size_y]
+              %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x2x32xf32> -> tensor<?x?x32xf32>
+              %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 8)>(%arg2)[%workgroup_size_x]
+              %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 32, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<?x32x?xf32>
+              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+              %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2)>(%arg1)[%workgroup_size_y]
+              %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 8)>(%arg2)[%workgroup_size_x]
+              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 4, s0)>(%arg0)[%workgroup_size_z]
+              %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 2, s0)>(%arg1)[%workgroup_size_y]
+              %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 8, s0)>(%arg2)[%workgroup_size_x]
+              %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+              %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+              %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x32xf32>, tensor<?x32x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+              flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x2x8xf32>
+            }
+          }
+        }
+        return
+      }
+      hal.interface @io attributes {sym_visibility = "private"} {
+        hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x2x8
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 2, 1]}
+//           CHECK-SAME:   workgroup_size = [2 : index, 2 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
+//           CHECK-NEXT:   %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 2)>()[%[[Y]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
+
+//                CHECK: func @batch_matmul_4x2x8()
+//                CHECK:   linalg.batch_matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 2, 8, 8], [], [1, 1, 4, 8]]}
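+
+// Sanity check (informal): the whole 2x8 output of each batch fits in one
+// workgroup, so a 2x2 workgroup suffices (2 threads * 1 row, 2 threads *
+// 4 columns) and K steps by 8.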
diff --git a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
index 49b983d..3afdcba 100644
--- a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
@@ -207,7 +207,7 @@
 
     SmallVector<mlir::ModuleOp, 8> innerModuleOps;
     innerModuleOps.reserve(sourceExecutableOpGroups.size());
-    for (auto hashExecutablePair : sourceExecutableOpGroups) {
+    for (const auto &hashExecutablePair : sourceExecutableOpGroups) {
       llvm::hash_code hash = hashExecutablePair.first;
       const auto &sourceExecutableOps = hashExecutablePair.second;
 
diff --git a/iree/vm/bytecode_dispatch_util.h b/iree/vm/bytecode_dispatch_util.h
index bce7fe8..5e17867 100644
--- a/iree/vm/bytecode_dispatch_util.h
+++ b/iree/vm/bytecode_dispatch_util.h
@@ -226,9 +226,9 @@
 #define VM_DecOperandRegF64(name)                          \
   *((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
   pc += kRegSize;
-#define VM_DecOperandRegRef(name, out_is_move)             \
-  &regs.ref[OP_I16(0) & regs.ref_mask];                    \
-  *(out_is_move) = OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT; \
+#define VM_DecOperandRegRef(name, out_is_move)                      \
+  &regs.ref[OP_I16(0) & regs.ref_mask];                             \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
   pc += kRegSize;
 #define VM_DecVariadicOperands(name) \
   VM_DecVariadicOperandsImpl(bytecode_data, &pc)
@@ -252,9 +252,9 @@
 #define VM_DecResultRegF64(name)                          \
   ((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
   pc += kRegSize;
-#define VM_DecResultRegRef(name, out_is_move)              \
-  &regs.ref[OP_I16(0) & regs.ref_mask];                    \
-  *(out_is_move) = OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT; \
+#define VM_DecResultRegRef(name, out_is_move)                       \
+  &regs.ref[OP_I16(0) & regs.ref_mask];                             \
+  *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
   pc += kRegSize;
 #define VM_DecVariadicResults(name) VM_DecVariadicOperands(name)
 
diff --git a/llvm-external-projects/iree-compiler-api/CMakeLists.txt b/llvm-external-projects/iree-compiler-api/CMakeLists.txt
index 4313f0b..76781b9 100644
--- a/llvm-external-projects/iree-compiler-api/CMakeLists.txt
+++ b/llvm-external-projects/iree-compiler-api/CMakeLists.txt
@@ -23,7 +23,7 @@
 # a CMake min version of 3.0, which causes them to set it locally to OLD.
 set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
 
-project(iree-compiler-backend LANGUAGES C CXX)
+project(iree-compiler-api LANGUAGES C CXX)
 
 # Directory layout.
 # When building in-tree, this directory exists relative to the overall
@@ -80,6 +80,10 @@
 # MLIR_ENABLE_BINDINGS_PYTHON option.
 set(MHLO_ENABLE_BINDINGS_PYTHON ON CACHE BOOL "" FORCE)
 
+# Required IREE settings.
+set(IREE_BUILD_PYTHON_BINDINGS ON CACHE BOOL "" FORCE)
+set(IREE_BUILD_OLD_PYTHON_COMPILER_API OFF CACHE BOOL "" FORCE)
+
 # TODO: Fix this upstream. Each of these system include hacks is broken in
 # a different way, so there is not an easy local fix. They should be removed
 # one by one until this project builds. Since this is the first time all of this
diff --git a/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh b/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
index a7a950f..362dde9 100755
--- a/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
+++ b/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
@@ -11,12 +11,19 @@
 set -eu -o errtrace
 
 project_dir="$(cd $(dirname $0)/.. && pwd)"
+workspace_dir="$project_dir/../.."
 build_dir="$project_dir/build"
 
+# Write out a .env file to the workspace.
+echo "PYTHONPATH=$build_dir/python_package:$build_dir/iree/bindings/python" > $workspace_dir/.env
+
 cmake -GNinja -B"$build_dir" "$project_dir" \
   -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld \
+  -DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld \
+  -DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld \
+  -DLLVM_OPTIMIZED_TABLEGEN=ON \
   "$@"
 
 cd $build_dir
-ninja
+ninja all iree/bindings/python/all
diff --git a/llvm-external-projects/iree-dialects/BUILD b/llvm-external-projects/iree-dialects/BUILD
index eb4db6e..f476d4f 100644
--- a/llvm-external-projects/iree-dialects/BUILD
+++ b/llvm-external-projects/iree-dialects/BUILD
@@ -13,6 +13,7 @@
     srcs = glob([
         "include/iree-dialects/Dialect/IREE/*.td",
         "include/iree-dialects/Dialect/IREEPyDM/IR/*.td",
+        "include/iree-dialects/Dialect/IREEPyDM/Transforms/*.td",
     ]),
 )
 
@@ -144,20 +145,50 @@
     ],
 )
 
+gentbl_cc_library(
+    name = "IREEPyDMTransformsIncGen",
+    strip_include_prefix = "include",
+    tbl_outs = [
+        (
+            ["-gen-pass-decls"],
+            "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.h.inc",
+        ),
+        (
+            ["-gen-pass-capi-header"],
+            "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.capi.h.inc",
+        ),
+        (
+            ["-gen-pass-capi-impl"],
+            "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.capi.cpp.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.td",
+    deps = [
+        ":TdFiles",
+        "@llvm-project//mlir:PassBaseTdFiles",
+    ],
+)
+
 cc_library(
     name = "IREEPyDMDialect",
     srcs = glob([
         "lib/Dialect/IREEPyDM/IR/*.cpp",
     ]),
-    hdrs = glob(["include/iree-dialects/Dialect/IREEPyDM/IR/*.h"]),
+    hdrs = glob([
+        "include/iree-dialects/Dialect/IREEPyDM/IR/*.h",
+        "include/iree-dialects/Dialect/IREEPyDM/Transforms/*.h",
+    ]),
     includes = ["include"],
     deps = [
         ":IREEPyDMInterfacesIncGen",
         ":IREEPyDMOpsIncGen",
+        ":IREEPyDMTransformsIncGen",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
     ],
 )
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h b/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
index 27a1ba2..613df95 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
@@ -8,6 +8,7 @@
 #define IREE_LLVM_EXTERNAL_PROJECTS_IREE_DIALECTS_C_DIALECTS_H
 
 #include "mlir-c/IR.h"
+#include "mlir-c/Pass.h"
 #include "mlir-c/Registration.h"
 
 #ifdef __cplusplus
@@ -50,6 +51,10 @@
 MLIR_CAPI_EXPORTED MlirType mlirIREEPyDMObjectTypeGet(MlirContext context,
                                                       MlirType primitive);
 
+/// Builds a pass pipeline which lowers the iree_pydm dialect to IREE.
+MLIR_CAPI_EXPORTED void mlirIREEPyDMBuildLowerToIREEPassPipeline(
+    MlirOpPassManager passManager);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt b/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
index fc3317e..66682e5 100644
--- a/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
+++ b/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
@@ -5,6 +5,7 @@
   MLIRIR
   IREEDialectsIREEDialect
   IREEDialectsIREEPyDMDialect
+  IREEDialectsIREEPyDMToIREEPasses
 )
 
 iree_dialects_target_includes(IREEDialectsCAPI)
diff --git a/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp b/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
index 56b053a..ad5e546 100644
--- a/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
+++ b/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
@@ -8,7 +8,13 @@
 
 #include "iree-dialects/Dialect/IREE/IREEDialect.h"
 #include "iree-dialects/Dialect/IREEPyDM/IR/Dialect.h"
+#include "iree-dialects/Dialect/IREEPyDM/Transforms/Passes.h"
+#include "mlir/CAPI/IR.h"
+#include "mlir/CAPI/Pass.h"
 #include "mlir/CAPI/Registration.h"
+#include "mlir/CAPI/Support.h"
+#include "mlir/CAPI/Utils.h"
+#include "mlir/CAPI/Wrap.h"
 
 //===----------------------------------------------------------------------===//
 // IREEDialect
@@ -59,3 +65,9 @@
   auto cppType = unwrap(primitive).cast<mlir::iree_pydm::PrimitiveType>();
   return wrap(mlir::iree_pydm::ObjectType::get(unwrap(ctx), cppType));
 }
+
+void mlirIREEPyDMBuildLowerToIREEPassPipeline(MlirOpPassManager passManager) {
+  auto *passManagerCpp = unwrap(passManager);
+  // TODO: Should be a pass pipeline, not loose passes in the C impl.
+  passManagerCpp->addPass(mlir::iree_pydm::createConvertIREEPyDMToIREEPass());
+}
diff --git a/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp b/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
index 00d280a..0b235ee 100644
--- a/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
+++ b/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
@@ -74,6 +74,15 @@
       },
       py::arg("context") = py::none(), py::arg("load") = true);
 
+  iree_pydm_m.def(
+      "build_lower_to_iree_pass_pipeline",
+      [](MlirPassManager passManager) {
+        MlirOpPassManager opPassManager =
+            mlirPassManagerGetAsOpPassManager(passManager);
+        mlirIREEPyDMBuildLowerToIREEPassPipeline(opPassManager);
+      },
+      py::arg("pass_manager"));
+
 #define DEFINE_IREEPYDM_NULLARY_TYPE(Name)                                 \
   mlir_type_subclass(iree_pydm_m, #Name "Type", mlirTypeIsAIREEPyDM##Name, \
                      typeClass)                                            \