Add support for vectorizing normal convolution ops. (#8460)

This PR gives slight improvements on MobileNet V3 and V2; the deltas for the other models are likely noise. PoseNet regresses in the multi-threaded configuration (taskset f0 + dylib), which looks like a matter of configuration tuning, so it is acceptable for now.

Configuration: taskset 80 + dylib-sync on a local Pixel 4

| Model            | Before  | After   |
| ---------------- | ------- | ------- |
| DeepLabV3        | 705 ms  | 704 ms  |
| MobileBertSquad  | 669 ms  | 673 ms  |
| MobileNetV2      | 104 ms  | 103 ms  |
| MobileNetV3Small | 25.4 ms | 24.7 ms |
| MobileSSD        | 205 ms  | 201 ms  |
| PoseNet          | 546 ms  | 545 ms  |

Configuration: taskset f0 + dylib on a local Pixel 4

| Model            | Before  | After   |
| ---------------- | ------- | ------- |
| DeepLabV3        | 371 ms  | 374 ms  |
| MobileBertSquad  | 375 ms  | 374 ms  |
| MobileNetV2      | 66.1 ms | 64.5 ms |
| MobileNetV3Small | 21.4 ms | 20.9 ms |
| MobileSSD        | 92.7 ms | 91.2 ms |
| PoseNet          | 189 ms  | 202 ms  |
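
For context, dispatches rooted at `linalg.conv_2d_nhwc_hwcf` now get a three-level tile-size list (flow/workgroup distribution, L1 parallel tiles, and vector-level reduction tiles, the latter two produced by `splitParallelAndReductionTiles`) and are routed to the new `CPUConvTileAndDecomposeExpert` pipeline. As a rough illustration, the attributes below mirror what the updated `materialize_launch_configuration.mlir` test expects for the `@conv` case (loop order N, OH, OW, OC, KH, KW, IC):

```mlir
// Illustrative only: tile sizes copied from the updated lit test; actual
// values depend on the target vector size and the op's static shapes.
#config = #iree_codegen.lowering.config<tile_sizes = [
    [0, 64, 64, 64, 0, 0, 0],  // flow/workgroup distribution
    [1, 1, 8, 8, 0, 0, 0],     // L1: parallel loops (N, OH, OW, OC)
    [0, 0, 0, 0, 1, 1, 8]],    // vector level: reduction loops (KH, KW, IC)
  native_vector_size = []>
#translation = #iree_codegen.translation.info<"CPUConvTileAndDecomposeExpert",
                                              workload_per_wg = []>
```

Tiling OH and KH to 1 is what lets the `decomposeToLowerDimOp` step rewrite the tiled 2-D convolution into 1-D named ops before vectorization.
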
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 204eaee..44179a6 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -16,6 +16,8 @@
     : StrEnumAttrCase<"CPUSingleTilingExpert">;
 def CPU_DoubleTilingExpert
     : StrEnumAttrCase<"CPUDoubleTilingExpert">;
+def CPU_ConvTileAndDecomposeExpert
+    : StrEnumAttrCase<"CPUConvTileAndDecomposeExpert">;
 def CPU_TileFuseAndVectorize
     : StrEnumAttrCase<"CPUTileFuseAndVectorize">;
 
@@ -46,9 +48,10 @@
     "DispatchLoweringPassPipeline",
     "identifier for pass pipeline use to lower dispatch region",
     [CPU_Default, CPU_SingleTilingExpert, CPU_DoubleTilingExpert,
-     CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
-     LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore, SPIRV_Distribute,
-     SPIRV_DistributeCopy, SPIRV_Vectorize,SPIRV_VectorizeToCooperativeOps,
+     CPU_ConvTileAndDecomposeExpert, CPU_TileFuseAndVectorize,
+     LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+     LLVMGPU_MatmulTensorCore, SPIRV_Distribute, SPIRV_DistributeCopy,
+     SPIRV_Vectorize, SPIRV_VectorizeToCooperativeOps,
      None]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
 }
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 384c29e..fbdb83e 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -198,7 +198,8 @@
 /// Adjusts the workload per workgroup to be a multiple of vector size to ensure
 /// that the op vectorizes.
 static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
-                              int64_t vectorSizeVal) {
+                              int64_t vectorSizeVal,
+                              Optional<int64_t> fallbackSizeVal = llvm::None) {
   if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
     return maxSize;
   }
@@ -209,7 +210,7 @@
       return i;
     }
   }
-  return vectorSizeVal;
+  return fallbackSizeVal ? fallbackSizeVal.getValue() : vectorSizeVal;
 }
 
 /// Returns the tile size to use for the Flow level of an operation that
@@ -276,6 +277,22 @@
   return distributedLevelTileSizes;
 }
 
+/// Splits `parallelSizes` so that parallel loops keep their tile sizes there
+/// while the reduction loop tile sizes are moved into `reductionSizes`.
+static void splitParallelAndReductionTiles(
+    linalg::LinalgOp op, SmallVectorImpl<int64_t> &parallelSizes,
+    SmallVectorImpl<int64_t> &reductionSizes) {
+  reductionSizes.assign(parallelSizes.begin(), parallelSizes.end());
+  for (auto iteratorType : llvm::enumerate(op.iterator_types())) {
+    if (iteratorType.value().cast<StringAttr>().getValue() ==
+        getParallelIteratorTypeName()) {
+      reductionSizes[iteratorType.index()] = 0;
+    } else {
+      parallelSizes[iteratorType.index()] = 0;
+    }
+  }
+}
+
 /// Sets the default configuration to use for an operation that implements the
 /// `PartitionableLoopsInterface`, given the iteration domain of all the loops.
 static LogicalResult setDefaultRootConfig(
@@ -580,9 +597,9 @@
   // Set the flow level tiling to the default.
   OpBuilder builder(genericOp.getContext());
   builder.setInsertionPoint(genericOp);
+  auto linalgOp = cast<linalg::LinalgOp>(genericOp.getOperation());
   SmallVector<Range> iterationDomain =
-      cast<linalg::LinalgOp>(genericOp.getOperation())
-          .createLoopRanges(builder, genericOp.getLoc());
+      linalgOp.createLoopRanges(builder, genericOp.getLoc());
   auto partitionableLoopsInterfaceOp =
       cast<IREE::Flow::PartitionableLoopsInterface>(genericOp.getOperation());
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
@@ -592,7 +609,7 @@
   // Set the Next level tile sizes.
   SmallVector<int64_t> l1TileSizes(numLoops, 0);
   Optional<SmallVector<int64_t, 4>> staticLoopRanges =
-      cast<linalg::LinalgOp>(genericOp.getOperation()).getStaticLoopRanges();
+      linalgOp.getStaticLoopRanges();
   for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) {
     if (flowTileSizes[loopNum]) {
       l1TileSizes[loopNum] =
@@ -607,16 +624,8 @@
               : minTileSizes[loopNum];
     }
   }
-
-  SmallVector<int64_t> vectorTileSizes = l1TileSizes;
-  for (auto iteratorType : llvm::enumerate(genericOp.iterator_types())) {
-    if (iteratorType.value().cast<StringAttr>().getValue() ==
-        getParallelIteratorTypeName()) {
-      vectorTileSizes[iteratorType.index()] = 0;
-    } else {
-      l1TileSizes[iteratorType.index()] = 0;
-    }
-  }
+  SmallVector<int64_t> vectorTileSizes;
+  splitParallelAndReductionTiles(linalgOp, l1TileSizes, vectorTileSizes);
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -628,6 +637,57 @@
       DispatchLoweringPassPipeline::CPUDoubleTilingExpert);
 }
 
+/// Sets the lowering configuration for linalg.conv_2d_nhwc_hwcf operations.
+static LogicalResult setRootConfig(
+    FuncOp entryPointFn, linalg::Conv2DNhwcHwcfOp convOp,
+    ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
+  auto linalgOp = cast<linalg::LinalgOp>(convOp.getOperation());
+  // Use the default distribution for the conv loops.
+  unsigned numLoops = linalgOp.getNumLoops();
+  int64_t vectorSize =
+      getVectorSize(entryPointFn, convOp.getResult(0).getType());
+  SmallVector<int64_t> minTileSizes(numLoops, 1);
+  SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
+
+  // Set the flow level tiling to the default.
+  OpBuilder builder(convOp.getContext());
+  builder.setInsertionPoint(convOp);
+  SmallVector<Range> iterationDomain =
+      linalgOp.createLoopRanges(builder, convOp.getLoc());
+  auto partitionableLoopsInterfaceOp =
+      cast<IREE::Flow::PartitionableLoopsInterface>(convOp.getOperation());
+  SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
+      iterationDomain, partitionableLoopsInterfaceOp, minTileSizes,
+      maxTileSizes);
+
+  // Static loop ranges, in (N, OH, OW, OC, KH, KW, IC) order.
+  Optional<SmallVector<int64_t, 4>> shapes = linalgOp.getStaticLoopRanges();
+  assert(shapes.hasValue() &&
+         "something went wrong when inferring loop ranges");
+
+  SmallVector<int64_t> l1TileSizes = {1, 1, 8, vectorSize * 2, 1, 1, 8};
+  for (auto i : llvm::seq<unsigned>(0, l1TileSizes.size())) {
+    auto tileSize = flowTileSizes[i] ? flowTileSizes[i] : shapes.getValue()[i];
+    // If the tile size is meant to be 1, do not adjust it to `vectorSize`;
+    // the op will then be decomposed into lower-rank named ops.
+    if (l1TileSizes[i] != 1) {
+      l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize,
+                                      /*fallbackSizeVal=*/1);
+    }
+  }
+  SmallVector<int64_t> vectorTileSizes;
+  splitParallelAndReductionTiles(linalgOp, l1TileSizes, vectorTileSizes);
+
+  TileSizesListType tileSizes;
+  tileSizes.push_back(flowTileSizes);
+  tileSizes.push_back(l1TileSizes);
+  tileSizes.push_back(vectorTileSizes);
+  return setOpConfigAndEntryPointFnTranslation(
+      entryPointFn, convOp, tileSizes,
+      /*nativeVectorSize=*/ArrayRef<int64_t>{},
+      DispatchLoweringPassPipeline::CPUConvTileAndDecomposeExpert);
+}
+
 /// Set default configuration for Linalg ops.
 static LogicalResult setRootConfig(
     FuncOp entryPointFn, linalg::LinalgOp linalgOp,
@@ -673,10 +733,10 @@
   // Redirect to individual operations.
   auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
     return TypeSwitch<Operation *, LogicalResult>(op)
-        .Case<IREE::LinalgExt::FftOp, linalg::GenericOp, linalg::Mmt4DOp>(
-            [&](auto op) {
-              return setRootConfig(entryPointFn, op, tiledLoops);
-            })
+        .Case<IREE::LinalgExt::FftOp, linalg::GenericOp, linalg::Mmt4DOp,
+              linalg::Conv2DNhwcHwcfOp>([&](auto op) {
+          return setRootConfig(entryPointFn, op, tiledLoops);
+        })
         .Case<linalg::ContractionOpInterface>([&](auto op) {
           return setRootConfig(entryPointFn, op, tiledLoops);
         })
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 3644e91..bd93ba6 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -182,6 +182,10 @@
             addDoubleTilingExpertPassPipeline(nestedModulePM);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
+              CPUConvTileAndDecomposeExpert:
+            addConvTileAndDecomposeExpertPassPipeline(nestedModulePM);
+            break;
+          case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUTileFuseAndVectorize:
             addTileFuseAndVectorizePassPipeline(nestedModulePM, lowerToVectors);
             break;
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 4f0cf24..ef7755d 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -259,6 +259,62 @@
   }
 }
 
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
+  // Do first level of tiling and distribution.
+  passManager.addNestedPass<FuncOp>(createTileAndDistributeToWorkgroupsPass());
+  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createCSEPass());
+
+  passManager.addNestedPass<FuncOp>(
+      createConvertToDestinationPassingStylePass());
+
+  passManager.addPass(createCanonicalizerPass());
+
+  // Run LinalgFusePass first in case we have fill + conv + generic ops; no
+  // vectorization is applied at this stage. For conv + generic op cases the
+  // reduction dims are not tiled here, so we have to tile along the reduction
+  // dims again later, which requires the ops to still be in Linalg form.
+  {
+    LinalgFusePassOptions options;
+    options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
+    passManager.addNestedPass<FuncOp>(createLinalgFusePass(options));
+    passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+    passManager.addNestedPass<FuncOp>(createCSEPass());
+  }
+
+  // Add the sandbox single tiling expert to tile and vectorize.
+  {
+    LinalgSingleTilingExpertPassOptions options;
+    options.decomposeToLowerDimOp = true;
+    options.vectorize = true;
+    options.vectorizePadding = true;
+    options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles);
+    passManager.addNestedPass<FuncOp>(
+        createLinalgSingleTilingExpertPass(options));
+    passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+    passManager.addNestedPass<FuncOp>(createCSEPass());
+  }
+
+  BufferizationOptions::AllocationFn allocationFn =
+      cpuComprehensiveBufferizeAllocationFn;
+  BufferizationOptions::DeallocationFn deallocationFn =
+      cpuComprehensiveBufferizeDeallocationFn;
+  BufferizationOptions::MemCpyFn memcpyFn = cpuComprehensiveBufferizeCopyFn;
+  addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
+                                      memcpyFn);
+
+  // Run IREE-specific passes before the vector lowering expert.
+  passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
+
+  // Add the vector lowering expert.
+  {
+    OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
+    LinalgVectorLoweringPassOptions options;
+    options.splitVectorTransfersTo = "shuffle";
+    addLowerToVectorTransforms(nestedFuncPassManager, options);
+  }
+}
+
 void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
                                          bool lowerToVectors) {
   // Do first level of tile and distribute to workgroups.
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 09d6dee..3769731 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -536,12 +536,11 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 64, 64, 64, 0, 0, 0]{{\]}}, native_vector_size = []>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDefault", workload_per_wg = []>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 64, 64, 64, 0, 0, 0], [1, 1, 8, 8, 0, 0, 0], [0, 0, 0, 0, 1, 1, 8]], native_vector_size = []>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUConvTileAndDecomposeExpert", workload_per_wg = []>
 //      CHECK: hal.executable.entry_point public @conv
 // CHECK-SAME:     translation.info = #[[TRANSLATION]]
 //      CHECK:     linalg.conv_2d_nhwc_hwcf
-//      CHECK:         lowering.config = #[[CONFIG]]
 
 // -----
 
@@ -561,6 +560,48 @@
     hal.executable.entry_point public @conv_static layout(#executable_layout)
     builtin.module {
       func @conv_static() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %c607520 = arith.constant 607520 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
+        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c607520) alignment(32) : !flow.dispatch.tensor<readonly:3x3x3x16xf32>
+        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
+        %5 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
+        %6 = linalg.fill(%cst, %5) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 28, 28, 16, 0, 0, 0], [1, 1, 4, 8, 0, 0, 0], [0, 0, 0, 0, 1, 1, 3]], native_vector_size = []>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUConvTileAndDecomposeExpert", workload_per_wg = []>
+//      CHECK: hal.executable.entry_point public @conv
+// CHECK-SAME:     translation.info = #[[TRANSLATION]]
+//      CHECK:     linalg.conv_2d_nhwc_hwcf
+
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @depthwise_conv_static {
+  hal.executable.variant public @system_elf_x86_64, target = <"llvm", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 16 : index,
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
+    hal.executable.entry_point public @depthwise_conv_static layout(#executable_layout)
+    builtin.module {
+      func @depthwise_conv_static() {
         %cst = arith.constant 0.0 : f32
         %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
             : !flow.dispatch.tensor<readonly:1x161x161x96xf32>
@@ -585,7 +626,7 @@
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 20, 40, 48, 0, 0]{{\]}}, native_vector_size = []>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDefault", workload_per_wg = []>
-//      CHECK: hal.executable.entry_point public @conv_static
+//      CHECK: hal.executable.entry_point public @depthwise_conv_static
 // CHECK-SAME:     translation.info = #[[TRANSLATION]]
 //      CHECK:     linalg.depthwise_conv_2d_nhwc_hwc
 // CHECK-SAME:       lowering.config  = #[[CONFIG]]
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index d4867ca..5ae0ea4 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -244,6 +244,10 @@
     ArrayRef<int64_t> workgroupSize = {});
 void addDoubleTilingExpertPassPipeline(OpPassManager &passManager);
 
+/// Populates the passes needed to tile, decompose, and vectorize convolution
+/// ops, using the Codegen drivers from sandbox.
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager);
+
 /// Populates the passes needed to multi level tile, fuse and vectorize lowering
 /// of linalg ops on tensors to vectors operations.
 void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,