[LLVMGPU] Add im2col pipeline for convolution codegen (#18086)

This PR adds the remaining passes needed for the IGEMM pipeline built on
the im2col op. It adds the `ConvertConv2DToIm2ColOp` pass, gated behind
a new `--iree-codegen-llvmgpu-use-igemm` flag, and it runs the im2col
decomposition pass (`DecomposeIm2col`) immediately before the
vectorization passes.
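
For context, the rewrite turns a 2-D convolution into an
`iree_linalg_ext.im2col` gather of input patches followed by a
GEMM-shaped `linalg.generic`. Below is a rough sketch for the conv in
the new test (NHWC `2x34x34x1280` input, HWCF `3x3x1280x1280` filter,
strides `[2, 2]`); the SSA names are illustrative, not the pass's
verbatim output:

```mlir
// M = 16 * 16 = 256 output pixels per image and K = 3 * 3 * 1280 = 11520
// filter-window elements, so the conv becomes a batched
// 256x11520 * 11520x1280 matmul.
%patches = iree_linalg_ext.im2col
    strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3]
    m_offset = [0] k_offset = [0]
    batch_pos = [0] m_pos = [1, 2] k_pos = [3]
  ins(%input : tensor<2x34x34x1280xf16>)
  outs(%empty : tensor<2x256x11520xf16>) -> tensor<2x256x11520xf16>
// The HWCF filter collapses to a 2-D RHS; a linalg.generic with
// (batch, m, n, k) iterators then performs the implicit GEMM.
%rhs = tensor.collapse_shape %filter [[0, 1, 2], [3]]
    : tensor<3x3x1280x1280xf16> into tensor<11520x1280xf16>
```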

`--iree-codegen-llvmgpu-use-igemm` is off by default, and will remain so
until the IGEMM pipeline is more robust and performant.
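To opt in, pass `--iree-codegen-llvmgpu-use-igemm=true` when compiling;
since the option is a plain `llvm::cl::opt`, it should be accepted by
any IREE tool that runs the LLVMGPU pass pipelines, e.g. `iree-compile`
or `iree-opt`.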

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index f2a9759..47d137e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -72,6 +72,11 @@
                    "allocated for the given target"),
     llvm::cl::init(163 * 1024));
 
+static llvm::cl::opt<bool>
+    clLLVMGPUUseIgemm("iree-codegen-llvmgpu-use-igemm",
+                      llvm::cl::desc("Enable implicit gemm for convolutions."),
+                      llvm::cl::init(false));
+
 llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                               const LLVMGPUPipelineOptions &options) {
   StringRef reorderStr = "<not set>";
@@ -241,6 +246,7 @@
 
 static void addGPUVectorizationPasses(OpPassManager &funcPassManager) {
   funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
+  funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass());
   // Vectorize.
   GenericVectorizationPassOptions options;
   options.vectorizePadding = true;
@@ -1043,6 +1049,8 @@
     OpPassManager &modulePassManager) {
   {
     FunctionLikeNest funcPassManager(modulePassManager);
+    funcPassManager.addPredicatedPass(
+        clLLVMGPUUseIgemm, IREE::LinalgExt::createConvertConv2DToIm2ColOpPass);
     funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
     addCommonTargetExecutablePreprocessingPasses(funcPassManager);
     addEncodingToNopPasses(funcPassManager);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 95463a8..1651697 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -126,3 +126,93 @@
 //       CHECK:     scf.yield %[[MM]]
 //       CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 2, 1, 3] : vector<2x2x4x1xf32> to vector<2x4x2x1xf32>
 //       CHECK:   vector.transfer_write %[[LOOP_T]], %[[B2]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
+    #hal.descriptor_set.binding<1, storage_buffer, ReadOnly>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 2], subgroup = [1, 2, 2], mma_kind = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>}>
+hal.executable private @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @conv_igemm_im2col ordinal(0) layout(#pipeline_layout)
+      attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @conv_igemm_im2col() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [128, 2, 1] subgroup_size = 64>} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>>
+        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x16x16x1280xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>> -> tensor<2x34x34x1280xf16>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>> -> tensor<3x3x1280x1280xf16>
+        %5 = tensor.empty() : tensor<2x16x16x1280xf32>
+        %6 = tensor.empty() : tensor<2x256x11520xf16>
+        %7 = iree_linalg_ext.im2col
+            strides = [2, 2] dilations = [1, 1] kernel_size = [3, 3]
+            m_offset = [0] k_offset = [0]
+            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+          ins(%3 : tensor<2x34x34x1280xf16>)
+          outs(%6 : tensor<2x256x11520xf16>) -> tensor<2x256x11520xf16>
+        %collapsed = tensor.collapse_shape %4 [[0, 1, 2], [3]] : tensor<3x3x1280x1280xf16> into tensor<11520x1280xf16>
+        %collapsed_0 = tensor.collapse_shape %5 [[0], [1, 2], [3]] : tensor<2x16x16x1280xf32> into tensor<2x256x1280xf32>
+        %8 = linalg.fill ins(%cst : f32) outs(%collapsed_0 : tensor<2x256x1280xf32>) -> tensor<2x256x1280xf32>
+        %9 = linalg.generic {
+          indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
+                           affine_map<(d0, d1, d2, d3) -> (d3, d2)>,
+                           affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
+          iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+          ins(%7, %collapsed : tensor<2x256x11520xf16>, tensor<11520x1280xf16>)
+          outs(%8 : tensor<2x256x1280xf32>) attrs =  {lowering_config = #config} {
+        ^bb0(%in: f16, %in_1: f16, %out: f32):
+          %10 = arith.extf %in : f16 to f32
+          %11 = arith.extf %in_1 : f16 to f32
+          %12 = arith.mulf %10, %11 : f32
+          %13 = arith.addf %12, %out : f32
+          linalg.yield %13 : f32
+        } -> tensor<2x256x1280xf32>
+        %expanded = tensor.expand_shape %9 [[0], [1, 2], [3]] output_shape [2, 16, 16, 1280] : tensor<2x256x1280xf32> into tensor<2x16x16x1280xf32>
+        flow.dispatch.tensor.store %expanded, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 16, 1280], strides = [1, 1, 1, 1] : tensor<2x16x16x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x16x16x1280xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @conv_igemm_im2col
+//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan set(0) binding(0)
+//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan set(0) binding(1)
+//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan set(0) binding(2)
+//   CHECK-DAG:   memref.alloc() : memref<1x64x32xf16, #gpu.address_space<workgroup>>
+//   CHECK-DAG:   memref.alloc() : memref<32x64xf16, #gpu.address_space<workgroup>>
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C720:.+]] = arith.constant 720 : index
+//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C720]] step %[[C2]] {{.*}} -> (vector<1x2x2x4x1xf32>)
+//       CHECK:     gpu.barrier
+//       CHECK:     %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16>
+//       CHECK:     vector.transfer_write %[[LHS_RD]]
+//       CHECK:     gpu.barrier
+//       CHECK:     %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<2x1x2x4xf16>
+//       CHECK:     %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<2x1x2x4xf16> to vector<1x2x1x2x4xf16>
+//       CHECK:     gpu.barrier
+//       CHECK:     %[[LHS_T:.+]] = vector.transpose %[[LHS_MM1]], [0, 1, 3, 2, 4] : vector<1x2x1x2x4xf16> to vector<1x2x2x1x4xf16>
+//       CHECK:     %[[RHS_RD:.+]] = vector.transfer_read %[[B1]]{{.*}} vector<8xf16>
+//       CHECK:     vector.transfer_write %[[RHS_RD]]
+//       CHECK:     gpu.barrier
+//       CHECK:     %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x2x1xf16>
+//       CHECK:     gpu.barrier
+//       CHECK:     %[[RHS_T:.+]] = vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x2x1xf16> to vector<2x2x1x4xf16>
+//       CHECK:     %[[MM:.+]] = iree_gpu.multi_mma %[[LHS_T]], %[[RHS_T]]
+//       CHECK:     scf.yield %[[MM]]
+//       CHECK:   %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 3, 2, 4] : vector<1x2x2x4x1xf32> to vector<1x2x4x2x1xf32>
+//       CHECK:   %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<2x4x2x1xf32> from vector<1x2x4x2x1xf32>
+//       CHECK:   vector.transfer_write %[[EXTRACT]], %[[B2]]