[LLVMGPU] Fix linear dim selection in GPUApplyTilingLevel (#17611)

`GPUApplyTilingLevel` skips dimensions with no tiling (a tile size of
zero), but the linear dim mapping on the generated forall loop was
still derived from the original dimension index, so the mapping ids
had gaps wherever an untiled dimension was skipped. This PR assigns
the ids from a separate counter so the linear dim mappings always
increment by 1, starting from 0.
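
For example, with the `thread = [0, 16]` config exercised by the new
test below, the enumerate-based indexing mapped the only tiled dim to
`#gpu.thread<linear_dim_1>`; with this fix it maps to
`#gpu.thread<linear_dim_0>`. A minimal standalone sketch of the new
counting scheme (plain ints stand in for the pass's OpFoldResult tile
sizes; this is an illustration, not the pass itself):

    #include <cstdio>
    #include <vector>

    int main() {
      // thread = [0, 16]: dim 0 is untiled, dim 1 is tiled by 16.
      std::vector<int> tileSizes = {0, 16};
      int idx = 0;
      for (int size : tileSizes) {
        if (size != 0)
          // idx advances only for tiled dims, so ids start at
          // linear_dim_0 and increment by 1; the old enumerate
          // index (1 here) left a gap.
          std::printf("linear_dim_%d\n", idx++);
      }
      return 0;
    }

Running the sketch prints `linear_dim_0`; the pre-fix indexing would
have printed `linear_dim_1`.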

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
index 6862aff..acd1890 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
@@ -107,10 +107,13 @@
       // TODO: Add some helpers to construct this based on the enum type rather
       // than doing it here.
       SmallVector<DeviceMappingAttrInterface> mapping;
-      for (auto [idx, size] : llvm::enumerate(tileSizes)) {
+      // Use a separate counter so the linear mapping ids stay contiguous
+      // when untiled (zero tile size) dims are skipped below.
+      int idx = 0;
+      for (auto size : tileSizes) {
         if (!isConstantIntValue(size, 0)) {
           unsigned mappingId =
-              static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx;
+              static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
           mapping.push_back(gpu::GPUThreadMappingAttr::get(
               context, static_cast<gpu::MappingId>(mappingId)));
         }
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
index 8c76a6e..ab39fc2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
@@ -37,6 +37,42 @@
 
 // -----
 
+#config = #iree_gpu.lowering_config<{thread = [0, 16]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @sequential_forall_mappings() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %6 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = ["parallel", "parallel"]
+      } ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs =  {lowering_config = #config} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %7 = arith.addf %in, %in_0 : f32
+      linalg.yield %7 : f32
+    } -> tensor<4x256xf32>
+    flow.dispatch.tensor.store %6, %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : tensor<4x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    return
+  }
+}
+
+// Verify that no loops are generated without a reduction configuration.
+// CHECK-LABEL: func.func @sequential_forall_mappings
+//   CHECK-NOT:   scf.for
+
+// THREAD-LABEL: func.func @sequential_forall_mappings
+//       THREAD:   scf.forall ({{.*}}) = (0) to (256) step (16)
+//       THREAD:     linalg.generic {{.*}} ins(%{{.*}}: tensor<4x16xf32>, tensor<4x16xf32>)
+//       THREAD:     scf.forall.in_parallel
+//       THREAD:   mapping = [#gpu.thread<linear_dim_0>]
+
+// -----
+
 module {
   func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [128, 2, 1] subgroup_size = 64>} {
     %c4 = arith.constant 4 : index