Extend DecomposeConvolutionToLowerDimOpsPass (#17069)
Updates `DecomposeConvolutionToLowerDimOpsPass` so that when a
convolution Op is decomposed, the lowering config attribute is updated
("decomposed") accordingly. This change leverages the fact that we know
which dimension is being "removed" when the decomposition happens. The
corresponding dim/size is simply deleted when updating the lowering
config.
With the logic added in this PR, we are making sure that
`DecomposeConvolutionToLowerDimOpsPass` will no longer "drop" the
lowering config. This is beneficial - it removes the need to restore the
tiling config (for e.g. the vectoriser) from the loop nest.
BEFORE `DecomposeConvolutionToLowerDimOpsPass`:
```mlir
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 4], [0, 0, 0, 0, 0, 0]]>
linalg.depthwise_conv_2d_nhwc_hwc lowering_config = #config
```
AFTER `DecomposeConvolutionToLowerDimOpsPass` _without_ this change:
```mlir
linalg.depthwise_conv_1d_nwc_wc
```
AFTER `DecomposeConvolutionToLowerDimOpsPass` _with_ this change:
```mlir
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0], [1, 1, 4, 0], [0, 0, 0, 4], [0, 0, 0, 0]]>
linalg.depthwise_conv_1d_nwc_wc lowering_config = #config
```
ATM, the logic implemented here is limited to depthwise HWC
convolutions. It can easily be extended to other Convs when that's
required.
For simplicity, it is also assumed that there's only one Conv Op per
function. This seems like the most common case. Adding support for
multiple conv ops per function is left as a TODO. It shouldn't be too
difficult to implement - it will require building a map of all
convs/lowering configs.

diff --git a/compiler/src/iree/compiler/Codegen/Common/DecomposeConvolutionToLowerDimOps.cpp b/compiler/src/iree/compiler/Codegen/Common/DecomposeConvolutionToLowerDimOps.cpp
index bc9333e..e6640a6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/DecomposeConvolutionToLowerDimOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/DecomposeConvolutionToLowerDimOps.cpp
@@ -6,7 +6,9 @@
#include "iree/compiler/Codegen/Common/PassDetail.h"
#include "iree/compiler/Codegen/Common/Passes.h"
+#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
@@ -16,6 +18,108 @@
namespace {
+static bool foldHDim(linalg::DepthwiseConv2DNhwcHwcOp convOp) {
+ Value kernel = convOp.getInputs().back();
+ Value output = convOp.getOutputs().front();
+
+ auto kernelType = dyn_cast<RankedTensorType>(kernel.getType());
+ auto outputType = dyn_cast<RankedTensorType>(output.getType());
+
+ auto kernelShape = kernelType.getShape();
+ auto outputShape = outputType.getShape();
+
+ int64_t khSize = kernelShape[0];
+ int64_t ohSize = outputShape[1];
+ bool removeH = (khSize == 1 && ohSize == 1);
+
+ return removeH;
+}
+
+/// Computes a "decomposed" lowering config attribute for a conv OP
+///
+/// This method complements the patterns to decompose 2D convolutions into 1D
+/// convs. Specifically, it will update the lowering config attached to a Conv
+/// Op in a way that matches the "decomposition" patterns.
+///
+/// At the moment only Depthwise HWC convolutions are supported.
+static FailureOr<IREE::Codegen::LoweringConfigAttr>
+computeDecomposedLoweringConfig(ArrayRef<Operation *> computeOps,
+ MLIRContext *context) {
+
+ // 0.1 Double-check that there's only one convolution Op.
+ // TODO: Make this hook work with multiple conv Ops
+ assert(llvm::count_if(computeOps,
+ [](Operation *op) {
+ return isa<linalg::ConvolutionOpInterface>(op);
+ }) == 1 &&
+ "Exactly 1 Linalg Conv Op is expected");
+
+ // 1. Get the conv Op to update
+ // ATM only 2D depthwise HWC convs are supported.
+ // TODO: Add support for other convs
+ linalg::DepthwiseConv2DNhwcHwcOp convOp;
+ for (auto op : computeOps) {
+ if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(op)) {
+ convOp = cast<linalg::DepthwiseConv2DNhwcHwcOp>(op);
+ break;
+ }
+ }
+
+ if (!convOp) {
+ return failure();
+ }
+
+ // ATM only folding of the H dim is supported.
+ // TODO: Add support for cases where the W dim is folded.
+ if (!foldHDim(convOp))
+ return failure();
+
+ // 2. Get the current lowering config attached to the Conv Op.
+ FailureOr<IREE::Codegen::LoweringConfigAttr> loweringConfigAttr =
+ getLoweringConfig(computeOps);
+ if (failed(loweringConfigAttr))
+ return failure();
+
+ // TODO: Either remove "interchange" from lowering_config or add support in
+ // this pass.
+ if (!loweringConfigAttr->isInterchangeEmpty())
+ return failure();
+
+ // 3. Calculate new tiling levels.
+  // Note that this will basically erase the _H_ dims from the original lowering
+  // config.
+ auto dims = linalg::inferConvolutionDims(convOp);
+ SmallVector<unsigned> hDimsToErase = {dims->outputImage[0],
+ dims->filterLoop[0]};
+ llvm::sort(hDimsToErase, [](auto x, auto y) { return x > y; });
+
+ SmallVector<IREE::Codegen::LoweringConfigTilingLevelAttr> newTilingLevelsList;
+ for (auto level : loweringConfigAttr.value().getTilingLevels()) {
+ SmallVector<int64_t> newSizes(level.getSizes());
+ SmallVector<bool> newScalableFlags(level.getScalableFlags());
+
+ llvm::for_each(hDimsToErase, [&newSizes](unsigned idx) {
+ newSizes.erase(newSizes.begin() + idx);
+ });
+ if (newScalableFlags.size() > 0) {
+ llvm::for_each(hDimsToErase, [&newScalableFlags](unsigned idx) {
+ newScalableFlags.erase(newScalableFlags.begin() + idx);
+ });
+ }
+
+ auto newLevel = IREE::Codegen::LoweringConfigTilingLevelAttr::get(
+ context, newSizes, /*interchange=*/{}, newScalableFlags);
+ newTilingLevelsList.push_back(newLevel);
+ }
+
+ // 4. Create and return a new lowering config attribute.
+ auto newTilingLevels = IREE::Codegen::LoweringConfigTilingLevelsAttr::get(
+ context, newTilingLevelsList);
+ return IREE::Codegen::LoweringConfigAttr::get(
+ context, newTilingLevels,
+ loweringConfigAttr.value().getNativeVectorSize());
+}
+
class DecomposeConvolutionToLowerDimOpsPass
: public DecomposeConvolutionToLowerDimOpsBase<
DecomposeConvolutionToLowerDimOpsPass> {
@@ -24,12 +128,43 @@
}
void runOnOperation() override {
MLIRContext *context = &getContext();
+ auto funcOp = dyn_cast<func::FuncOp>(getOperation());
+ auto computeOps = getComputeOps(funcOp);
+
+ // 1. If there's exactly 1 conv in this function (most common case),
+ // compute the "decomposed" version of its lowering config attribute.
+ // TODO: Add support for cases with multiple convs per function
+ int64_t numConvOps = llvm::count_if(computeOps, [](Operation *op) {
+ return isa<linalg::ConvolutionOpInterface>(op);
+ });
+
+ if (numConvOps == 0) {
+ return;
+ }
+
+ FailureOr<IREE::Codegen::LoweringConfigAttr> newLoweringConfig;
+ if (numConvOps == 1) {
+ newLoweringConfig = computeDecomposedLoweringConfig(computeOps, context);
+ }
+
+ // 2. Run the patterns. This is the key part of this pass.
RewritePatternSet patterns(context);
linalg::populateDecomposeConvolutionPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(getOperation(),
std::move(patterns)))) {
return signalPassFailure();
}
+
+ // 3. If there's exactly 1 conv in this function (most common case), attach
+ // a "decomposed" lowering config created earlier to the newly decomposed
+ // conv Op.
+ if (numConvOps == 1 && succeeded(newLoweringConfig)) {
+ auto computeOps = getComputeOps(funcOp);
+ for (auto computeOp : computeOps) {
+ if (isa<linalg::DepthwiseConv1DNwcWcOp>(computeOp))
+ setLoweringConfig(computeOp, newLoweringConfig.value());
+ }
+ }
}
};
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index da43167..2d09efc 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -31,6 +31,7 @@
"erase_dead_alloc_and_stores.mlir",
"decompose_affine_ops.mlir",
"decompose_batch_mmt4d_ops.mlir",
+ "decompose_conv2d.mlir",
"decompose_linalg_generic.mlir",
"decompose_pack_unpack_ops.mlir",
"decompose_softmax.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index 1952a24..df9fd29 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -26,6 +26,7 @@
"convolutions.mlir"
"decompose_affine_ops.mlir"
"decompose_batch_mmt4d_ops.mlir"
+ "decompose_conv2d.mlir"
"decompose_linalg_generic.mlir"
"decompose_pack_unpack_ops.mlir"
"decompose_softmax.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir b/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir
new file mode 100644
index 0000000..bba473f
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir
@@ -0,0 +1,26 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-convolution-to-lower-dim-ops))" --split-input-file %s | FileCheck %s
+
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 4], [0, 0, 0, 0, 0, 0]]>
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
+module {
+ func.func @restrict_num_workgroups() attributes {hal.executable.target = #executable_target_system_elf_arm_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
+ %input = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>> -> tensor<1x1x4x4xf32>
+ %filter = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 4, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>> -> tensor<1x4x4xf32>
+ %5 = tensor.empty() : tensor<1x1x1x4xf32>
+ %output = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config,
+ strides = dense<1> : tensor<2xi64>} ins(%input, %filter : tensor<1x1x4x4xf32>, tensor<1x4x4xf32>) outs(%output : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1] : tensor<1x1x1x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
+ return
+ }
+}
+
+// CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0], [1, 1, 4, 0], [0, 0, 0, 4], [0, 0, 0, 0]]>
+// CHECK: linalg.depthwise_conv_1d_nwc_wc
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: ins({{.*}}, {{.*}} : tensor<1x4x4xf32>, tensor<4x4xf32>) outs({{.*}} : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
index 945cad7..6d4d130 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
@@ -287,6 +287,12 @@
return SmallVector<int64_t>(levels[level].getInterchange());
}
+bool LoweringConfigAttr::isInterchangeEmpty() {
+ return llvm::none_of(getTilingLevels(), [](auto level) {
+ return !level.getInterchange().empty();
+ });
+}
+
LogicalResult
LoweringConfigAttr::verify(function_ref<InFlightDiagnostic()> emitError,
LoweringConfigTilingLevelsAttr levels,
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
index 7b1d172..bc02ae5 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
@@ -251,6 +251,10 @@
// Returns the tile interchange for a level set for the op.
SmallVector<int64_t> getTileInterchangeVals(unsigned level);
+ // Returns true if there are no tile interchange values (this means that
+ // interchange can be ignored).
+ bool isInterchangeEmpty();
+
// Returns the native vector size to use.
SmallVector<int64_t> getNativeVectorSizeVals() {
return SmallVector<int64_t>(getNativeVectorSize());