Revert "[LLVMCPU][ArmSME] Add `2d-scalable-to-1d-scalable` pass" (#16963)

Reverts openxla/iree#16712

Some builds that were not included in pre-commit are failing; let's
revert until we can take a closer look.

(see also https://github.com/openxla/iree/pull/16961)
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
index b01d646..d5f12e4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
@@ -52,106 +52,46 @@
   }
 };
 
-/// Returns the tiling level that contains the vector dim at `dimPos` (which is
-/// an index into the result of `getVectorTileSizes()`).
-unsigned TilingConfig::getTilingLevelForVectorDimPosition(unsigned dimPos) {
-  constexpr std::array vectorTilingLevels{VectorCommonParallelTiles,
-                                          VectorReductionTiles,
-                                          VectorInnerParallelTiles};
-  ArrayRef<TilingLevel> possibleLevels = vectorTilingLevels;
-  if (!hasVectorInnerParallelLevel())
-    possibleLevels = possibleLevels.drop_back();
-  std::optional<unsigned> foundLevel;
-  auto tilingLevels = loweringConfig.getTilingLevels();
-  for (TilingLevel level : possibleLevels) {
-    auto tilingLevelIndex = getActualLevel(level);
-    if (tilingLevels[tilingLevelIndex].getSizes()[dimPos] != 0) {
-      assert(!foundLevel.has_value() &&
-             "expected at most one tile size to be non-zero");
-      foundLevel = tilingLevelIndex;
-    }
-  }
-  assert(foundLevel.has_value() && "no vector size found for `dimPos`");
-  return *foundLevel;
-}
-
-/// Returns the tile size (size + scalability pair) at `index`. The
-/// `scalableFlags` can be empty.
-static std::pair<int64_t, bool> getTileSizeAtIndex(ArrayRef<int64_t> sizes,
-                                                   ArrayRef<bool> scalableFlags,
-                                                   unsigned index) {
-  return std::make_pair(sizes[index],
-                        index < scalableFlags.size() && scalableFlags[index]);
-}
-
 /// Returns the tile sizes of all the vector dimensions, including parallel
 /// and reduction dimensions.
 SizesAndScalableFlags TilingConfig::getVectorTileSizes() {
   unsigned numDims = getNumDimensions();
   SmallVector<int64_t> vectorSizes(numDims, 0);
   SmallVector<bool> scalableFlags(numDims, false);
-  auto tilingLevels = loweringConfig.getTilingLevels();
-  for (int dimPos = 0; dimPos < numDims; ++dimPos) {
-    unsigned dimTilingLevel = getTilingLevelForVectorDimPosition(dimPos);
-    std::tie(vectorSizes[dimPos], scalableFlags[dimPos]) = getTileSizeAtIndex(
-        tilingLevels[dimTilingLevel].getSizes(),
-        tilingLevels[dimTilingLevel].getScalableFlags(), dimPos);
+  auto [parallelCommonSizes, parallelCommonScalableFlags] =
+      getVectorCommonParallelSizes();
+  auto [reductionSizes, reductionScalableFlags] = getVectorReductionSizes();
+  SizesAndScalableFlags parallelInnerTiles;
+  if (hasVectorInnerParallelLevel()) {
+    parallelInnerTiles = getVectorInnerParallelSizes();
   }
+
+  for (int i = 0; i < numDims; ++i) {
+    SmallVector<bool> dimSizes;
+    dimSizes.push_back(!!parallelCommonSizes[i] ||
+                       parallelCommonScalableFlags[i]);
+    dimSizes.push_back(!!reductionSizes[i] || reductionScalableFlags[i]);
+    if (hasVectorInnerParallelLevel())
+      dimSizes.push_back(!!parallelInnerTiles.first[i] ||
+                         parallelInnerTiles.second[i]);
+
+    unsigned nonZeroCnt = llvm::count(dimSizes, true);
+    assert(nonZeroCnt <= 1 && "expected one tile size at most to be non-zero");
+    (void)nonZeroCnt;
+
+    vectorSizes[i] = parallelCommonSizes[i] ^ reductionSizes[i];
+    if (hasVectorInnerParallelLevel())
+      vectorSizes[i] ^= parallelInnerTiles.first[i];
+
+    scalableFlags[i] =
+        parallelCommonScalableFlags[i] || reductionScalableFlags[i];
+    if (hasVectorInnerParallelLevel())
+      scalableFlags[i] |= parallelInnerTiles.second[i];
+  }
+
   return std::make_pair(vectorSizes, scalableFlags);
 }
 
-/// Returns a new `LoweringConfigAttr`, with the tile sizes of vector
-/// dimensions, set to `sizes`, and the corresponding scalability set to
-/// `scalableFlags`.
-IREE::Codegen::LoweringConfigAttr
-TilingConfig::getLoweringConfigWithNewVectorSizes(
-    ArrayRef<int64_t> sizes, ArrayRef<bool> scalableFlags) {
-  unsigned numDims = getNumDimensions();
-  assert(sizes.size() == numDims &&
-         "expected `sizes` to match number of dimensions");
-  assert((scalableFlags.empty() || scalableFlags.size() == numDims) &&
-         "expected `scalableFlags` to match "
-         "number of dimensions (or be empty)");
-
-  // Make a map from tiling levels to vector dims at that level.
-  std::array<SmallVector<unsigned, 4>, MaxNumTileLevels> tilingLevelToDimsMap;
-  for (unsigned dimPos = 0; dimPos < numDims; ++dimPos) {
-    auto tilingLevelIndex = getTilingLevelForVectorDimPosition(dimPos);
-    tilingLevelToDimsMap[tilingLevelIndex].push_back(dimPos);
-  }
-
-  MLIRContext *context = loweringConfig.getContext();
-  auto tilingLevels = loweringConfig.getTilingLevels();
-  SmallVector<IREE::Codegen::LoweringConfigTilingLevelAttr> newTilingLevelsList(
-      tilingLevels.begin(), tilingLevels.end());
-
-  // For each vector tiling level:
-  for (auto [tilingLevelIndex, tilingLevelDims] :
-       llvm::enumerate(tilingLevelToDimsMap)) {
-    if (tilingLevelDims.empty())
-      continue;
-    auto level = tilingLevels[tilingLevelIndex];
-    SmallVector<int64_t> newSizes(level.getSizes());
-    SmallVector<bool> newScalableFlags(level.getScalableFlags());
-    newScalableFlags.resize(numDims);
-    // 1. Update all the vector sizes within that tiling level.
-    for (unsigned dimPos : tilingLevelDims) {
-      std::tie(newSizes[dimPos], newScalableFlags[dimPos]) =
-          getTileSizeAtIndex(sizes, scalableFlags, dimPos);
-    }
-    // 2. Then create a new tiling level attribute for that level.
-    auto newLevel = IREE::Codegen::LoweringConfigTilingLevelAttr::get(
-        context, newSizes, level.getInterchange(), newScalableFlags);
-    newTilingLevelsList[tilingLevelIndex] = newLevel;
-  }
-
-  // Create a new `lowering_config` attribute.
-  auto newTilingLevels = IREE::Codegen::LoweringConfigTilingLevelsAttr::get(
-      context, newTilingLevelsList);
-  return IREE::Codegen::LoweringConfigAttr::get(
-      context, newTilingLevels, loweringConfig.getNativeVectorSize());
-}
-
 /// Returns a list with the tiling levels that can be fused for this
 /// configuration.
 SmallVector<int64_t> TilingConfig::getFusableLevels() {
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
index 5ac2fdf..ef2a138 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
+++ b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
@@ -109,21 +109,10 @@
     return getVectorSizesForLevel(getVectorInnerParallelLevel());
   }
 
-  /// Returns the tiling level that contains the vector dim at `dimPos` (which
-  /// is an index into the result of `getVectorTileSizes()`).
-  unsigned getTilingLevelForVectorDimPosition(unsigned dimPos);
-
   /// Returns the tile sizes of all the vector dimensions, including parallel
   /// and reduction dimensions.
   SizesAndScalableFlags getVectorTileSizes();
 
-  /// Returns a new `LoweringConfigAttr`, with the tile sizes of vector
-  /// dimensions, set to `sizes`, and the corresponding scalability set to
-  /// `scalableFlags`.
-  IREE::Codegen::LoweringConfigAttr
-  getLoweringConfigWithNewVectorSizes(ArrayRef<int64_t> sizes,
-                                      ArrayRef<bool> scalableFlags);
-
   /// Returns a list with the tiling levels that can be fused for this
   /// configuration.
   SmallVector<int64_t> getFusableLevels();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index 62f1208..d030294 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -52,7 +52,6 @@
         "DispatchABI.cpp",
         "ExpandF16OpToF32Pass.cpp",
         "KernelDispatch.cpp",
-        "LLVMCPU2DScalableTo1DScalable.cpp",
         "LLVMCPUAssignConstantOrdinals.cpp",
         "LLVMCPUAssignImportOrdinals.cpp",
         "LLVMCPUCheckIRBeforeLLVMConversion.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index abbf5c0..f794896 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -53,7 +53,6 @@
     "DispatchABI.cpp"
     "ExpandF16OpToF32Pass.cpp"
     "KernelDispatch.cpp"
-    "LLVMCPU2DScalableTo1DScalable.cpp"
     "LLVMCPUAssignConstantOrdinals.cpp"
     "LLVMCPUAssignImportOrdinals.cpp"
     "LLVMCPUCheckIRBeforeLLVMConversion.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPU2DScalableTo1DScalable.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPU2DScalableTo1DScalable.cpp
deleted file mode 100644
index 7ae37a4..0000000
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPU2DScalableTo1DScalable.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/Common/TileSizeSelection.h"
-#include "iree/compiler/Codegen/LLVMCPU/PassDetail.h"
-#include "iree/compiler/Codegen/LLVMCPU/Passes.h"
-#include "iree/compiler/Codegen/LLVMCPU/Utils.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "mlir/Dialect/SCF/Utils/Utils.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir::iree_compiler {
-
-namespace {
-
-/// Currently, IREE requires `lowering_config`s to be propagated to all compute
-/// ops within a dispatch region. This can be problematic for SME which only
-/// supports 2D scalable outer product operations -- if an operation cannot be
-/// lowered to an outer product, we can only scalably vectorize it in one
-/// dimension.
-///
-/// The solution here is this pass (`2d-scalable-to-1d-scalable`) that runs just
-/// before vectorization, that drops unsupported scalable tile/vector sizes,
-/// producing loops of ops that will only be vectorized scalably in one
-/// dimension. This allows earlier passes like `tile-and-fuse` to still function
-/// correctly.
-///
-/// Take this simple example:
-///
-/// ```mlir
-/// // Lowering configs propagated (from matmul):
-/// linalg.fill {lowering_config = [[4], [4]]
-/// linalg.matmul {lowering_config = [[4], [4], 1]
-/// linalg.generic {lowering_config = [[4], [4]]
-/// ```
-/// Here the `linalg.generic` cannot be vectorized with 2D scalable vectors.
-///
-/// After `tile-and-fuse` (which requires consistent lowering configs):
-/// ```mlir
-/// scf.for i in range(0, 1000) step 4 x vscale {
-///   scf.for j in range(0, 2000) step 4 x vscale {
-///     linalg.fill {lowering_config = [[4], [4]]
-///     for k in range(0, 100) step 1 {
-///       linalg.matmul {lowering_config = [[4], [4], 1]
-///     }
-///     // 2D scalable vectorization unsupported here:
-///     linalg.generic {lowering_config = [[4], [4]]
-///   }
-/// }
-/// ```
-///
-/// Unsupported scalability removed (by `2d-scalable-to-1d-scalable`):
-/// ```mlir
-/// scf.for i in range(0, 1000) step 4 x vscale {
-///   scf.for j in range(0, 2000) step 4 x vscale {
-///     linalg.fill {lowering_config = [[4], [4]]
-///     for k in range(0, 100) step 1 {
-///       linalg.matmul {lowering_config = [[4], [4], 1]
-///     }
-///     // Insert a new loop:
-///     for n in range(0, 4 x vscale) step 4 {
-///        // Drop a scalable dim:
-///        linalg.generic {lowering_config = [4, [4]]
-///     }
-///   }
-/// }
-/// ```
-///
-/// This can now be vectorized and lowered successfully, which produces a
-/// dispatch that mixes SME and SVE.
-class LLVMCPU2DScalableTo1DScalablePass
-    : public LLVMCPU2DScalableTo1DScalableBase<
-          LLVMCPU2DScalableTo1DScalablePass> {
-public:
-  using LLVMCPU2DScalableTo1DScalableBase::LLVMCPU2DScalableTo1DScalableBase;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert<arith::ArithDialect, linalg::LinalgDialect, scf::SCFDialect>();
-  }
-
-  void runOnOperation() override;
-};
-
-static bool opKnownToSupport2DScalableVectorizationWithArmSME(Operation *op) {
-  return isa<linalg::MatmulOp, linalg::MatmulTransposeAOp, linalg::FillOp>(op);
-}
-
-// Note: It would be easy to parameterize this rewrite to convert N-D scalable
-// operations to M-D scalable ones (where M < N). However this is currently not
-// needed.
-static LogicalResult
-dropScalabilityFromUnsupportedOperations(mlir::FunctionOpInterface funcOp,
-                                         bool assumeArmSME = false) {
-  // Note: Which operations should have scalability dropped is specific to
-  // ArmSME. The rest of this rewrite could be generic (though currently
-  // there's no other targets that support > 1D scalability).
-  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
-  bool isArmSME = assumeArmSME || hasSMEFeature(targetAttr);
-
-  if (!isArmSME)
-    return success();
-
-  SmallVector<TilingInterface> computeOps;
-  funcOp.walk([&](TilingInterface op) {
-    if (!opKnownToSupport2DScalableVectorizationWithArmSME(op))
-      computeOps.push_back(op);
-  });
-
-  for (TilingInterface tilingOp : computeOps) {
-    auto loweringConfigAttr = getLoweringConfig(tilingOp);
-    if (!loweringConfigAttr)
-      continue;
-
-    TilingConfig tilingConfig(loweringConfigAttr);
-    auto [vectorSizes, scalableFlags] = tilingConfig.getVectorTileSizes();
-    auto numScalableDims = llvm::count(scalableFlags, true);
-
-    if (numScalableDims <= 1)
-      continue;
-
-    SmallVector<int64_t> loopTileSizes;
-    SmallVector<bool> newScalableFlags;
-    for (auto [flag, size] : llvm::zip_equal(scalableFlags, vectorSizes)) {
-      if (flag && numScalableDims >= 2) {
-        --numScalableDims;
-        loopTileSizes.push_back(size);
-        newScalableFlags.push_back(false);
-      } else {
-        loopTileSizes.push_back(0);
-        newScalableFlags.push_back(flag);
-      }
-    }
-
-    IRRewriter rewriter(tilingOp->getContext());
-    rewriter.setInsertionPoint(tilingOp);
-
-    // 2. Re-tile the operation with some scalability dropped. This introduces
-    // loops for previously scalable vector/tile sizes.
-    scf::SCFTilingOptions options;
-    setSCFTileSizes(options, tilingOp, loopTileSizes, /*tileScalableFlags=*/{});
-    auto tilingResult = scf::tileUsingSCF(rewriter, tilingOp, options);
-    if (failed(tilingResult))
-      return failure();
-
-    // 3. Update the lowering config of the new tiled operations.
-    auto newLoweringConfig = tilingConfig.getLoweringConfigWithNewVectorSizes(
-        vectorSizes, newScalableFlags);
-    for (auto *newOp : tilingResult->tiledOps) {
-      if (isa<TilingInterface>(newOp))
-        setLoweringConfig(newOp, newLoweringConfig);
-    }
-
-    rewriter.replaceOp(tilingOp, tilingResult->replacements);
-  }
-  return success();
-}
-
-void LLVMCPU2DScalableTo1DScalablePass::runOnOperation() {
-  if (failed(dropScalabilityFromUnsupportedOperations(getOperation(),
-                                                      assumeArmSME)))
-    signalPassFailure();
-}
-
-} // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPU2DScalableTo1DScalablePass() {
-  return std::make_unique<LLVMCPU2DScalableTo1DScalablePass>();
-}
-
-} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 2701f51..a577d47 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -397,11 +397,6 @@
     nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUPeelPass());
   }
 
-  if (pipelineOpt.enableAArch64SSVE) {
-    nestedModulePM.addNestedPass<func::FuncOp>(
-        createLLVMCPU2DScalableTo1DScalablePass());
-  }
-
   {
     nestedModulePM.addNestedPass<func::FuncOp>(createVectorizePadPass());
     nestedModulePM.addNestedPass<func::FuncOp>(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
index b7aa798..22714cf 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
@@ -67,9 +67,6 @@
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTileAndFusePass(int64_t tilingLevel = -1);
 
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMCPU2DScalableTo1DScalablePass();
-
 /// Pass to tile TilingInterface ops with given tilingLevel.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMCPUTilePass(int64_t tilingLevel = -1);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
index 7969416..c4f5884 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
@@ -139,19 +139,6 @@
   ];
 }
 
-// Note: This pass is currently only required when targeting Arm SME (which is
-// the only target that currently has some concept of 2D scalability).
-def LLVMCPU2DScalableTo1DScalable :
-    InterfacePass<"iree-llvmcpu-2d-scalable-to-1d-scalable", "mlir::FunctionOpInterface"> {
-  let summary = "Pass to replace unsupported scalable dimensions with loops.";
-  let constructor =
-      "mlir::iree_compiler::createLLVMCPU2DScalableTo1DScalablePass()";
-  let options = [
-    Option<"assumeArmSME", "assume-arm-sme", "bool", /*default=*/"false",
-      "Assume the current target is ArmSME (used for testing)">
-  ];
-}
-
 def LLVMCPUUnfuseFMAOps :
     InterfacePass<"iree-llvmcpu-unfuse-fma-pass", "mlir::FunctionOpInterface"> {
   let summary = "Convert llvm.fma into unfused mulf and addf ops";
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d-scalable-to-1d-scalable.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d-scalable-to-1d-scalable.mlir
deleted file mode 100644
index cbf25db..0000000
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/2d-scalable-to-1d-scalable.mlir
+++ /dev/null
@@ -1,88 +0,0 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-2d-scalable-to-1d-scalable{assume-arm-sme=true},cse))" --split-input-file %s | FileCheck %s
-
-#compute_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0], [[4], [4]], [0, 0], [0, 0]]>
-#matmul_config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0], [[4], [4], 0], [0, 0, 1], [0, 0, 0]]>
-#dim_0_map = affine_map<(d0)[s0] -> (-d0 + 32400, s0)>
-#dim_1_map = affine_map<(d0)[s0] -> (-d0 + 16, s0)>
-
-// Here's an example from a dispatch where a matmul has been given a 2D-scalable
-// lowering config (#matmul_config) for ArmSME. That config has been propagated
-// to compute ops within that same dispatch as (#compute_config).
-//
-// This is okay for the linalg.fill but the linalg.generic cannot be lowered
-// to make use of 2D scalable vectors. ArmSME only supports 2D scalable outer
-// products, so if it's not an outer product, we can only scalably vectorize in
-// one dimension.
-//
-// The initial tile-and-fuse pass requires lowering configs to be consistent,
-// so we keep the keep the lowering_configs unchanged until after that pass.
-//
-// 2d-scalable-to-1d-scalable can then remove unsupported scalable
-// dimensions, and introduce loops. This results in dispatches that fuse both
-// SME and SVE.
-
-// Extracted from an IR dump after iree-llvmcpu-tile-and-fuse:
-func.func @scalable_2d_matmul_and_generic(%arg0: tensor<32400x32xf32>, %arg1: tensor<32x16xf32>, %arg2: tensor<32400x16xf32>, %arg3: tensor<16xf32>) -> tensor<32400x16xf32> {
-  %c0 = arith.constant 0 : index
-  %c4 = arith.constant 4 : index
-  %c16 = arith.constant 16 : index
-  %c32400 = arith.constant 32400 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = vector.vscale
-  %1 = arith.muli %0, %c4 : index
-  %2 = scf.for %arg4 = %c0 to %c32400 step %1 iter_args(%arg5 = %arg2) -> (tensor<32400x16xf32>) {
-    %3 = scf.for %arg6 = %c0 to %c16 step %1 iter_args(%arg7 = %arg5) -> (tensor<32400x16xf32>) {
-      %4 = affine.min #dim_0_map(%arg4)[%1]
-      %5 = affine.min #dim_1_map(%arg6)[%1]
-      %extracted_slice = tensor.extract_slice %arg0[%arg4, 0] [%4, 32] [1, 1] : tensor<32400x32xf32> to tensor<?x32xf32>
-      %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg6] [32, %5] [1, 1] : tensor<32x16xf32> to tensor<32x?xf32>
-      %6 = tensor.empty(%4, %5) : tensor<?x?xf32>
-      %7 = linalg.fill {lowering_config = #compute_config}
-        ins(%cst : f32) outs(%6 : tensor<?x?xf32>) -> tensor<?x?xf32>
-      %8 = linalg.matmul {lowering_config = #matmul_config}
-        ins(%extracted_slice, %extracted_slice_0 : tensor<?x32xf32>, tensor<32x?xf32>)
-        outs(%7 : tensor<?x?xf32>) -> tensor<?x?xf32>
-      %extracted_slice_1 = tensor.extract_slice %arg3[%arg6] [%5] [1] : tensor<16xf32> to tensor<?xf32>
-      %extracted_slice_2 = tensor.extract_slice %arg7[%arg4, %arg6] [%4, %5] [1, 1] : tensor<32400x16xf32> to tensor<?x?xf32>
-      %9 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
-        iterator_types = ["parallel", "parallel"]}
-        ins(%8, %extracted_slice_1 : tensor<?x?xf32>, tensor<?xf32>)
-        outs(%extracted_slice_2 : tensor<?x?xf32>) attrs =  {lowering_config = #compute_config} {
-      ^bb0(%in: f32, %in_3: f32, %out: f32):
-        %10 = arith.mulf %in, %in_3 : f32
-        linalg.yield %10 : f32
-      } -> tensor<?x?xf32>
-      %inserted_slice = tensor.insert_slice %9 into %arg7[%arg4, %arg6] [%4, %5] [1, 1] : tensor<?x?xf32> into tensor<32400x16xf32>
-      scf.yield %inserted_slice : tensor<32400x16xf32>
-    }
-    scf.yield %3 : tensor<32400x16xf32>
-  }
-  return %2 : tensor<32400x16xf32>
-}
-// CHECK: #[[FILL_CONFIG:.*]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0], {{\[}}[4], [4]], [0, 0], [0, 0]]>
-// CHECK: #[[MATMUL_CONFIG:.*]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], {{\[}}[4], [4], 0], [0, 0, 1], [0, 0, 0]]>
-// CHECK: #[[GENERIC_CONFIG:.*]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0], [4, [4]], [0, 0], [0, 0]]>
-//
-//      CHECK: func.func @scalable_2d_matmul_and_generi
-//      CHECK:   %[[C4:.*]] = arith.constant 4 : index
-//      CHECK:   %[[VSCALE:.*]] = vector.vscale
-//      CHECK:   %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
-//      CHECK:   scf.for
-// CHECK-SAME:    step %[[C4_VSCALE]]
-// CHECK-SAME:   {
-//      CHECK:     scf.for
-// CHECK-SAME:      step %[[C4_VSCALE]]
-// CHECK-SAME:     {
-//      CHECK:       linalg.fill
-// CHECK-SAME:         lowering_config = #[[FILL_CONFIG]]
-//      CHECK:       linalg.matmul
-// CHECK-SAME:         lowering_config = #[[MATMUL_CONFIG]]
-//      CHECK:       scf.for
-// CHECK-SAME:        step %[[C4]]
-// CHECK-SAME:       {
-//      CHECK:         linalg.generic
-// CHECK-SAME:           lowering_config = #[[GENERIC_CONFIG]]
-//      CHECK:       }
-//      CHECK:     }
-//      CHECK:   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index 6a2c1d8..c0fae3f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -19,7 +19,6 @@
     srcs = enforce_glob(
         # keep sorted
         [
-            "2d-scalable-to-1d-scalable.mlir",
             "aarch64_dotprod_vector_lowering.mlir",
             "aarch64_vector_lowering.mlir",
             "apply_scale_lowering.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 94e9be2..ad2ee35 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -14,7 +14,6 @@
   NAME
     lit
   SRCS
-    "2d-scalable-to-1d-scalable.mlir"
     "aarch64_dotprod_vector_lowering.mlir"
     "aarch64_vector_lowering.mlir"
     "apply_scale_lowering.mlir"