Add support for vectorizing depthwise convolution ops. (#8527)

This yields around 5–10% improvement for vision models on Pixel 4.
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 3ca6445..726cdd8 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -605,15 +605,19 @@
       DispatchLoweringPassPipeline::CPUDoubleTilingExpert);
 }
 
-/// Sets the lowering configuration for linalg.conv_2d_nhwc_hwcf operations.
-static LogicalResult setRootConfig(
-    FuncOp entryPointFn, linalg::Conv2DNhwcHwcfOp convOp,
-    ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
-  auto linalgOp = cast<linalg::LinalgOp>(convOp.getOperation());
+/// Sets the lowering configuration for linalg.conv_2d_nhwc_hwcf and
+/// linalg.depthwise_conv_2d_nhwc_hwc operations.
+static LogicalResult setConvRootConfig(
+    FuncOp entryPointFn, linalg::LinalgOp convOp,
+    ArrayRef<LoopTilingAndDistributionInfo> tiledLoops,
+    ArrayRef<int64_t> targetL1TileSizes, int64_t vectorSize) {
+  if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+          convOp.getOperation())) {
+    return failure();
+  }
+
   // Use the default distribution for the matmul loops.
-  unsigned numLoops = linalgOp.getNumLoops();
-  int64_t vectorSize =
-      getVectorSize(entryPointFn, convOp.getResult(0).getType());
+  unsigned numLoops = convOp.getNumLoops();
   SmallVector<int64_t> minTileSizes(numLoops, 1);
   SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
 
@@ -621,19 +625,17 @@
   OpBuilder builder(convOp.getContext());
   builder.setInsertionPoint(convOp);
   SmallVector<Range> iterationDomain =
-      linalgOp.createLoopRanges(builder, convOp.getLoc());
+      convOp.createLoopRanges(builder, convOp.getLoc());
   auto partitionableLoopsInterfaceOp =
       cast<IREE::Flow::PartitionableLoopsInterface>(convOp.getOperation());
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
       iterationDomain, partitionableLoopsInterfaceOp, minTileSizes,
       maxTileSizes);
 
-  // Shapes of N, OH, OW, OC, KH, KW, IC
-  Optional<SmallVector<int64_t, 4>> shapes = linalgOp.getStaticLoopRanges();
-  assert(shapes.hasValue() &&
-         "something went wrong when inferring loop ranges");
-
-  SmallVector<int64_t> l1TileSizes = {1, 1, 8, vectorSize * 2, 1, 1, 8};
+  // Shapes of N, OH, OW, OC, KH, KW, (IC)
+  Optional<SmallVector<int64_t, 4>> shapes = convOp.getStaticLoopRanges();
+  SmallVector<int64_t> l1TileSizes(targetL1TileSizes.begin(),
+                                   targetL1TileSizes.end());
   for (auto i : llvm::seq<unsigned>(0, l1TileSizes.size())) {
     auto tileSize = flowTileSizes[i] ? flowTileSizes[i] : shapes.getValue()[i];
     // If the tile size is intended to be 1, do not adjust it to `vectorSize`.
@@ -643,7 +645,7 @@
     }
   }
   SmallVector<int64_t> vectorTileSizes;
-  splitParallelAndReductionTiles(linalgOp, l1TileSizes, vectorTileSizes);
+  splitParallelAndReductionTiles(convOp, l1TileSizes, vectorTileSizes);
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -654,6 +656,30 @@
       DispatchLoweringPassPipeline::CPUConvTileAndDecomposeExpert);
 }
 
+static LogicalResult setRootConfig(
+    FuncOp entryPointFn, linalg::Conv2DNhwcHwcfOp convOp,
+    ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
+  auto linalgOp = cast<linalg::LinalgOp>(convOp.getOperation());
+  int64_t vectorSize =
+      getVectorSize(entryPointFn, convOp.getResult(0).getType());
+  SmallVector<int64_t> l1TileSizes = {1, 1, 8, vectorSize * 2, 1, 1, 8};
+  return setConvRootConfig(entryPointFn, linalgOp, tiledLoops, l1TileSizes,
+                           vectorSize);
+}
+
+/// Sets the lowering configuration for linalg.depthwise_conv_2d_nhwc_hwc
+/// operations.
+static LogicalResult setRootConfig(
+    FuncOp entryPointFn, linalg::DepthwiseConv2DNhwcHwcOp convOp,
+    ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
+  auto linalgOp = cast<linalg::LinalgOp>(convOp.getOperation());
+  int64_t vectorSize =
+      getVectorSize(entryPointFn, convOp.getResult(0).getType());
+  SmallVector<int64_t> l1TileSizes = {1, 1, 8, vectorSize * 2, 1, 3};
+  return setConvRootConfig(entryPointFn, linalgOp, tiledLoops, l1TileSizes,
+                           vectorSize);
+}
+
 /// Set default configuration for Linalg ops.
 static LogicalResult setRootConfig(
     FuncOp entryPointFn, linalg::LinalgOp linalgOp,
@@ -700,9 +726,10 @@
   auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
     return TypeSwitch<Operation *, LogicalResult>(op)
         .Case<IREE::LinalgExt::FftOp, linalg::GenericOp, linalg::Mmt4DOp,
-              linalg::Conv2DNhwcHwcfOp>([&](auto op) {
-          return setRootConfig(entryPointFn, op, tiledLoops);
-        })
+              linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+            [&](auto op) {
+              return setRootConfig(entryPointFn, op, tiledLoops);
+            })
         .Case<linalg::ContractionOpInterface>([&](auto op) {
           return setRootConfig(entryPointFn, op, tiledLoops);
         })
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index dfa935a..593810f 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -625,8 +625,8 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 20, 40, 48, 0, 0]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 20, 40, 48, 0, 0], [1, 1, 8, 8, 0, 0], [0, 0, 0, 0, 1, 3]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
 //      CHECK: hal.executable.entry_point public @depthwise_conv_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK:     linalg.depthwise_conv_2d_nhwc_hwc
@@ -766,8 +766,8 @@
     }
   }
 }
-//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 7, 64, 0, 0]{{\]}}>
-//   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 7, 64, 0, 0], [1, 1, 7, 8, 0, 0], [0, 0, 0, 0, 1, 1]]>
+//   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
 //       CHECK: hal.executable.entry_point public @restrict_num_workgroups
 //  CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //       CHECK: linalg.depthwise_conv_2d_nhwc_hwc