[LLVMGPU] Drop WorkgroupSpecializationPass (#18212)

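This pass creates control flow (an `scf.if` with a static-shape branch and
a dynamic-shape fallback) when the tile and distribute phase results in
dynamic dimensions. However, `IREEComprehensiveBufferizePass` is capable of
handling this later on, so the pass and its tests are removed.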
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
index 91895b3..da45408 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -75,7 +75,6 @@
         "Passes.cpp",
         "VectorReductionToGPU.cpp",
         "WorkgroupReordering.cpp",
-        "WorkgroupSpecializationPass.cpp",
     ],
     hdrs = [
         "GPUPatterns.h",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
index 8905673..e22fa03 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -73,7 +73,6 @@
     "Passes.cpp"
     "VectorReductionToGPU.cpp"
     "WorkgroupReordering.cpp"
-    "WorkgroupSpecializationPass.cpp"
   DEPS
     ::PassHeaders
     ::PassesIncGen
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index 88c3d24..36507fd 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -217,13 +217,4 @@
   ];
 }
 
-def WorkgroupSpecializationPass :
-    InterfacePass<"iree-codegen-workgroup-specialization", "mlir::FunctionOpInterface"> {
-  let summary = "Specialize workgroup distribution loops";
-  let dependentDialects = [
-    "::mlir::affine::AffineDialect", "::mlir::linalg::LinalgDialect",
-    "::mlir::scf::SCFDialect", "::mlir::tensor::TensorDialect",
-  ];
-}
-
 #endif // IREE_CODEGEN_COMMON_GPU_PASSES
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp
deleted file mode 100644
index ccb966b..0000000
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright 2022 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-//=== WorkgroupSpecializationPass.cpp ------------------------------------===//
-//
-// This pass specializes the workgroup distribution loops with the tile sizes.
-//
-// For example, it converts
-//
-//   %tileSizeY = affine.min ...
-//   %tileSizeX = affine.min ...
-//   the_op with bounded tile sizes (The tensor is of dynamic shape.)
-//
-// into
-//
-//   %tileSizeY = affine.min ...
-//   %tileSizeX = affine.min ...
-//   %cmp0 = arith.cmpi %worksizeY, %tilesizeY
-//   %cmp1 = arith.cmpi %worksizeX, %tilesizeX
-//   %cond = arith.and %cmp0, %cmp1
-//   scf.if %cond
-//     operation with the static shape with the main tile sizes
-//   else
-//     original nested loops with dynamic shaped op
-//
-//===---------------------------------------------------------------------===//
-
-#include "iree/compiler/Codegen/Common/GPU/Passes.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-
-#define DEBUG_TYPE "iree-codegen-workgroup-specialization"
-
-namespace mlir::iree_compiler {
-
-#define GEN_PASS_DEF_WORKGROUPSPECIALIZATIONPASS
-#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
-
-namespace {
-static llvm::cl::opt<bool> clEnableWorkgroupSpecialization(
-    "iree-codegen-enable-workgroup-specialization",
-    llvm::cl::desc("Enable workgroup specialization."), llvm::cl::init(true));
-
-static std::optional<int64_t>
-getConstantLowerBound(affine::AffineMinOp affineMinOp) {
-  for (AffineExpr expr : affineMinOp.getMap().getResults()) {
-    if (auto cst = dyn_cast<AffineConstantExpr>(expr)) {
-      return cst.getValue();
-    }
-  }
-  return std::nullopt;
-}
-
-// Specialize the distributed function with the main tile sizes.
-//
-// Transformed output
-//   cond = (boundedTileSizeY != TileX) && (boundedTileSizeX != TileY) && ...
-//   scf.if cond
-//     distribution loops with static shapes with the tile size
-//   else
-//     distribution loops with dynamic shapes with the tile size
-//
-// Steps:
-// 1. Walk the code and collect affine.min that only depend on workgroup.id
-// and have one constant result.
-// 2. Move those at the top of the function
-// 3. Create a condition that ANDs all the affineMin == constant
-// 4. Splice the rest of the block and clone into a specialized if/else
-static void specializeFunction(mlir::FunctionOpInterface funcOp) {
-  SmallVector<affine::AffineMinOp> minSizeOps;
-  SmallVector<Operation *> ids;
-  funcOp.walk([&minSizeOps, &ids](Operation *op) {
-    if (auto affineMin = dyn_cast<affine::AffineMinOp>(op)) {
-      for (Value operand : affineMin->getOperands()) {
-        if (!operand.getDefiningOp<IREE::HAL::InterfaceWorkgroupIDOp>()) {
-          return WalkResult::advance();
-        }
-        ids.push_back(operand.getDefiningOp());
-      }
-      if (!getConstantLowerBound(affineMin)) {
-        return WalkResult::advance();
-      }
-      minSizeOps.push_back(affineMin);
-    }
-    return WalkResult::advance();
-  });
-  if (minSizeOps.empty()) {
-    return;
-  }
-
-  auto loc = funcOp.getLoc();
-  Block *block = &(*funcOp.getBlocks().begin());
-
-  OpBuilder builder(funcOp->getContext());
-  OpBuilder::InsertionGuard guard(builder);
-  // Move ops at the top of the function. This is always correct as those only
-  // depends on workgroup ids.
-  for (affine::AffineMinOp affineMin : llvm::reverse(minSizeOps)) {
-    affineMin->moveBefore(&block->front());
-  }
-  for (Operation *id : llvm::reverse(ids)) {
-    id->moveBefore(&block->front());
-  }
-  builder.setInsertionPointAfter(minSizeOps.back());
-  // create a condition for scf.if
-  Value cond;
-  SmallVector<Value> constantOps; // ConstantIndexOps for tile sizes
-  for (unsigned i = 0, e = minSizeOps.size(); i != e; ++i) {
-    affine::AffineMinOp minOp = minSizeOps[i];
-    int64_t lowerBound = *getConstantLowerBound(minOp);
-    // Generate a compare op that checks the dynamic size is equal to the
-    // constant main tile size.
-    Value constant = builder.create<arith::ConstantIndexOp>(loc, lowerBound);
-    constantOps.push_back(constant);
-    Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
-                                              minOp, constant);
-    cond = cond ? builder.create<arith::AndIOp>(loc, cond, cmp) : cmp;
-  }
-
-  // generate scf.if %cond
-  auto ifOp = builder.create<scf::IfOp>(loc, cond, /*withElseRegion=*/true);
-
-  // Transfer the original body to the scf.else body.
-  auto origBodyBegin = ++Block::iterator(ifOp);
-  auto origBodyEnd = --block->end(); // yield
-
-  Block *elseBlock = ifOp.elseBlock();
-  elseBlock->getOperations().splice(elseBlock->begin(), block->getOperations(),
-                                    origBodyBegin, origBodyEnd);
-  // Clone the else block into the then block. minOps are replaced during the
-  // cloning.
-  auto b = ifOp.getThenBodyBuilder();
-  IRMapping bvm;
-  for (unsigned i = 0, e = minSizeOps.size(); i != e; ++i) {
-    if (minSizeOps[i]) {
-      bvm.map(minSizeOps[i], constantOps[i]);
-    }
-  }
-  for (auto &blockOp : elseBlock->without_terminator()) {
-    b.clone(blockOp, bvm);
-  }
-  return;
-}
-
-struct WorkgroupSpecializationPass final
-    : impl::WorkgroupSpecializationPassBase<WorkgroupSpecializationPass> {
-  void runOnOperation() override {
-    if (!clEnableWorkgroupSpecialization)
-      return;
-
-    FunctionOpInterface funcOp = getOperation();
-    specializeFunction(funcOp);
-  }
-};
-
-} // namespace
-} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index 9651d49..7b47488 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -78,7 +78,6 @@
             "vectorize_memref_copy.mlir",
             "vectorize_tensor_pad.mlir",
             "vector_layout_analysis.mlir",
-            "workgroup_specialization.mlir",
         ],
         include = ["*.mlir"],
         exclude = [
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index d2b97e2..adfafb2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -74,7 +74,6 @@
     "vector_layout_analysis.mlir"
     "vectorize_memref_copy.mlir"
     "vectorize_tensor_pad.mlir"
-    "workgroup_specialization.mlir"
   TOOLS
     FileCheck
     iree-opt
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir
deleted file mode 100644
index 1654fce..0000000
--- a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir
+++ /dev/null
@@ -1,150 +0,0 @@
-// RUN: iree-opt --iree-codegen-enable-workgroup-specialization --pass-pipeline="builtin.module(func.func(iree-codegen-workgroup-specialization),canonicalize,cse)" --split-input-file %s | FileCheck %s
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [16, 4, 0], [0, 0, 64]]>
-#map = affine_map<()[s0] -> (s0 * 64)>
-#map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)>
-#map2 = affine_map<()[s0] -> (s0 * -64 + 789, 64)>
-func.func @matmul_tensors() {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
-  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<456x789xf32>>
-  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<123x789xf32>>
-  %workgroup_id_x = hal.interface.workgroup.id[0] : index
-  %workgroup_id_y = hal.interface.workgroup.id[1] : index
-  %3 = affine.apply #map()[%workgroup_id_y]
-  %4 = affine.min #map1()[%workgroup_id_y]
-  %5 = affine.apply #map()[%workgroup_id_x]
-  %6 = affine.min #map2()[%workgroup_id_x]
-  %7 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%4, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<?x456xf32>
-  %8 = flow.dispatch.tensor.load %1, offsets = [0, %5], sizes = [456, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<456x789xf32>> -> tensor<456x?xf32>
-  %9 = tensor.empty(%4, %6) : tensor<?x?xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %11 = linalg.matmul {lowering_config = #config} ins(%7, %8 : tensor<?x456xf32>, tensor<456x?xf32>) outs(%10 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<123x789xf32>>
-  return
-}
-
-// CHECK: func.func @matmul_tensors()
-// CHECK:       %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       %[[CMP0:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index
-// CHECK:       %[[CMP1:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index
-// CHECK:       %[[COND:.+]] = arith.andi %[[CMP0]], %[[CMP1]] : i1
-// CHECK:       scf.if %[[COND]] {
-// CHECK:         linalg.matmul
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<64x456xf32>, tensor<456x64xf32>) outs(%{{.+}} : tensor<64x64xf32>) -> tensor<64x64xf32>
-// CHECK:       } else {
-// CHECK:         linalg.matmul
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<?x456xf32>, tensor<456x?xf32>) outs(%{{.+}} : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [16, 4, 0], [0, 0, 64]]>
-#map = affine_map<()[s0] -> (s0 * 64)>
-#map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)>
-#map2 = affine_map<()[s0] -> (s0 * -64 + 789, 64)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-func.func @add_tensors() {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<123x789xf32>>
-  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<123x789xf32>>
-  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<123x789xf32>>
-  %workgroup_id_x = hal.interface.workgroup.id[0] : index
-  %workgroup_id_y = hal.interface.workgroup.id[1] : index
-  %3 = affine.apply #map()[%workgroup_id_y]
-  %4 = affine.min #map1()[%workgroup_id_y]
-  %5 = affine.apply #map()[%workgroup_id_x]
-  %6 = affine.min #map2()[%workgroup_id_x]
-  %7 = flow.dispatch.tensor.load %0, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x789xf32>> -> tensor<?x?xf32>
-  %8 = flow.dispatch.tensor.load %1, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x789xf32>> -> tensor<?x?xf32>
-  %9 = tensor.empty(%4, %6) : tensor<?x?xf32>
-  %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %11 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) attrs =  {lowering_config = #config} {
-  ^bb0(%in: f32, %in_0: f32, %out: f32):
-    %12 = arith.addf %in, %in_0 : f32
-    linalg.yield %12 : f32
-  } -> tensor<?x?xf32>
-  flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<123x789xf32>>
-  return
-}
-
-// CHECK: func.func @add_tensors()
-// CHECK:       %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       %[[CMP0:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index
-// CHECK:       %[[CMP1:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index
-// CHECK:       %[[COND:.+]] = arith.andi %[[CMP0]], %[[CMP1]] : i1
-// CHECK:       scf.if %[[COND]] {
-// CHECK:         linalg.generic
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<64x64xf32>, tensor<64x64xf32>) outs(%{{.+}} : tensor<64x64xf32>)
-// CHECK:       } else {
-// CHECK:         linalg.generic
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%{{.+}} : tensor<?x?xf32>)
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#config = #iree_codegen.lowering_config<tile_sizes = [[2, 256, 4]]>
-#map = affine_map<()[s0] -> (s0 * 2)>
-#map1 = affine_map<()[s0] -> (s0 * 256)>
-#map2 = affine_map<()[s0] -> (s0 * -256 + 30522, 256)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-#map4 = affine_map<(d0, d1) -> (d1)>
-func.func @unaligned_partial_loop() {
-  %c512 = arith.constant 512 : index
-  %c786944 = arith.constant 786944 : index
-  %c265458176 = arith.constant 265458176 : index
-  %c0 = arith.constant 0 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor<readonly:tensor<128x768xf32>>
-  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>>
-  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor<readonly:tensor<30522xf32>>
-  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
-  %workgroup_id_x = hal.interface.workgroup.id[0] : index
-  %workgroup_id_y = hal.interface.workgroup.id[1] : index
-  %4 = affine.apply #map()[%workgroup_id_y]
-  %5 = affine.apply #map1()[%workgroup_id_x]
-  %6 = affine.min #map2()[%workgroup_id_x]
-  %7 = flow.dispatch.tensor.load %0, offsets = [%4, 0], sizes = [2, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x768xf32>> -> tensor<2x768xf32>
-  %8 = flow.dispatch.tensor.load %1, offsets = [0, %5], sizes = [768, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>> -> tensor<768x?xf32>
-  %9 = tensor.empty(%6) : tensor<2x?xf32>
-  %10 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%9 : tensor<2x?xf32>) -> tensor<2x?xf32>
-  %11 = linalg.matmul {lowering_config = #config} ins(%7, %8 : tensor<2x768xf32>, tensor<768x?xf32>) outs(%10 : tensor<2x?xf32>) -> tensor<2x?xf32>
-  %12 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [%6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<30522xf32>> -> tensor<?xf32>
-  %13 = tensor.empty(%6) : tensor<2x?xf32>
-  %14 = linalg.generic {indexing_maps = [#map3, #map4, #map3], iterator_types = ["parallel", "parallel"]} ins(%11, %12 : tensor<2x?xf32>, tensor<?xf32>) outs(%13 : tensor<2x?xf32>) attrs =  {lowering_config = #config} {
-  ^bb0(%in: f32, %in_0: f32, %out: f32):
-    %15 = arith.addf %in, %in_0 : f32
-    linalg.yield %15 : f32
-  } -> tensor<2x?xf32>
-  flow.dispatch.tensor.store %14, %3, offsets = [%4, %5], sizes = [2, %6], strides = [1, 1] : tensor<2x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
-  return
-}
-
-// CHECK: func.func @unaligned_partial_loop()
-// CHECK:       %[[C256:.+]] = arith.constant 256 : index
-// CHECK:       %[[COND:.+]] = arith.cmpi eq, %{{.+}}, %[[C256]] : index
-// CHECK:       scf.if %[[COND]] {
-// CHECK:         linalg.matmul
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<2x768xf32>, tensor<768x256xf32>) outs(%{{.+}} : tensor<2x256xf32>)
-// CHECK:       } else {
-// CHECK:         linalg.matmul
-// CHECK-SAME:                  ins(%{{.+}}, %{{.+}} : tensor<2x768xf32>, tensor<768x?xf32>) outs(%{{.+}} : tensor<2x?xf32>)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 250645e..c4dde0d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -270,7 +270,6 @@
   tileAndDistributeToWorkgroup(funcPassManager);
 
   funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createWorkgroupSpecializationPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -456,7 +455,6 @@
   tileAndDistributeToWorkgroup(funcPassManager);
 
   funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createWorkgroupSpecializationPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -653,7 +651,6 @@
   tileAndDistributeToWorkgroup(funcPassManager);
 
   funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createWorkgroupSpecializationPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 67a8439..3ed0f36 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -77,7 +77,6 @@
             "vector_lowering.mlir",
             "vector_to_gpu.mlir",
             "winograd_pipeline_test.mlir",
-            "workgroup_specialization_pipeline_test.mlir",
         ],
         include = ["*.mlir"],
         # tensor_dialect_*_spec is a an MLIR file that specifies a
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 46366b4..692ce93 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -73,7 +73,6 @@
     "vector_lowering.mlir"
     "vector_to_gpu.mlir"
     "winograd_pipeline_test.mlir"
-    "workgroup_specialization_pipeline_test.mlir"
   TOOLS
     FileCheck
     iree-opt
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
deleted file mode 100644
index 73ffa19..0000000
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
+++ /dev/null
@@ -1,114 +0,0 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-hal.executable private @forward_dispatch_116 {
-  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
-    hal.executable.export public @forward_dispatch_116_matmul_128x30522x768 ordinal(0) layout(#pipeline_layout) {
-    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
-      hal.return %x, %y, %z : index, index, index
-    }
-    builtin.module {
-      func.func @forward_dispatch_116_matmul_128x30522x768() {
-        %c512 = arith.constant 512 : index
-        %c786944 = arith.constant 786944 : index
-        %c265458176 = arith.constant 265458176 : index
-        %c0 = arith.constant 0 : index
-        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor<readonly:tensor<128x768xf32>>
-        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>>
-        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor<readonly:tensor<30522xf32>>
-        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
-        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x768xf32>> -> tensor<128x768xf32>
-        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [768, 30522], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>> -> tensor<768x30522xf32>
-        %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [30522], strides = [1] : !flow.dispatch.tensor<readonly:tensor<30522xf32>> -> tensor<30522xf32>
-        %7 = tensor.empty() : tensor<128x30522xf32>
-        %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<128x30522xf32>) -> tensor<128x30522xf32>
-        %9 = linalg.matmul ins(%4, %5 : tensor<128x768xf32>, tensor<768x30522xf32>) outs(%8 : tensor<128x30522xf32>) -> tensor<128x30522xf32>
-        %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<128x30522xf32>, tensor<30522xf32>) outs(%7 : tensor<128x30522xf32>) {
-        ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-          %11 = arith.addf %arg0, %arg1 : f32
-          linalg.yield %11 : f32
-        } -> tensor<128x30522xf32>
-        flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [128, 30522], strides = [1, 1] : tensor<128x30522xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
-        return
-      }
-    }
-  }
-}
-
-// The specialized workgroup should have vector operations.
-
-// CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768
-//       CHECK: arith.cmpi eq
-//       CHECK: scf.if
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.contract
-//       CHECK:   vector.transfer_read
-//       CHECK:   vector.broadcast
-//       CHECK:   vector.transfer_write
-//       CHECK: else
-//   CHECK-NOT:   vector.transfer
-
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
-  ]>
-]>
-#map = affine_map<(d0) -> (d0)>
-#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-hal.executable private @vectorized_dispatch_0 {
-  hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
-    hal.executable.export public @vectorized_dispatch_0_generic_102401 ordinal(0) layout(#pipeline_layout) {
-    ^bb0(%arg0: !hal.device, %arg1: index):
-      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
-      hal.return %x, %y, %z : index, index, index
-    }
-    builtin.module {
-      func.func @vectorized_dispatch_0_generic_102401() {
-        %c0 = arith.constant 0 : index
-        %cst = arith.constant -3.000000e+00 : f32
-        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
-        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
-        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<102401xf32>>
-        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor<readonly:tensor<102401xf32>> -> tensor<102401xf32>
-        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor<readonly:tensor<102401xf32>> -> tensor<102401xf32>
-        %5 = tensor.empty() : tensor<102401xf32>
-        %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<102401xf32>, tensor<102401xf32>) outs(%5 : tensor<102401xf32>) {
-        ^bb0(%in: f32, %in_0: f32, %out: f32):
-          %7 = math.fma %cst, %in, %in_0 : f32
-          linalg.yield %7 : f32
-        } -> tensor<102401xf32>
-        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [102401], strides = [1] : tensor<102401xf32> -> !flow.dispatch.tensor<writeonly:tensor<102401xf32>>
-        return
-      }
-    }
-  }
-}
-
-// CHECK-LABEL: func.func @vectorized_dispatch_0_generic_102401
-//  CHECK-DAG:   %[[cst:.*]] = arith.constant 0.000000e+00 : f32
-//  CHECK-DAG:   %[[c256:.*]] = arith.constant 256 : index
-//  CHECK-DAG:   %[[c0:.*]] = arith.constant 0 : index
-//      CHECK:   %[[BLKX:.*]] = hal.interface.workgroup.id[0] : index
-//      CHECK:   %[[BLKX2:.*]] = affine.min #{{.+}}()[%[[BLKX]]]
-//      CHECK:   %[[CMP:.*]] = arith.cmpi eq, %[[BLKX2]], %[[c256]] : index
-//      CHECK:   scf.if %[[CMP]]
-//      CHECK:   %[[ARR:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
-//      CHECK:   %[[ARR2:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
-//      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
-//      CHECK:   %[[AFF:.*]] = affine.apply #{{.+}}(%[[TIDX]])[%[[BLKX]]]
-//      CHECK:   vector.transfer_read %[[ARR]][%[[AFF]]], %[[cst]] {in_bounds = [true]} : memref<102401xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
-//      CHECK:   vector.transfer_read %[[ARR2]][%[[AFF]]], %[[cst]] {in_bounds = [true]} : memref<102401xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>