[LLVMGPU] Fix linear dim selection in GPUApplyTilingLevel (#17611)

`GPUApplyTilingLevel` skips dimensions with no tiling (a tile size of
zero), but the linear dim mapping on the generated forall loop was
still derived from the original dimension index, so the mapping ids
had gaps wherever an untiled dimension was skipped. This PR assigns
the ids from a separate counter so the linear dim mappings always
increment by 1, starting from 0.
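
For example, with the `thread = [0, 16]` config exercised by the new
test below, the enumerate-based indexing mapped the only tiled dim to
`#gpu.thread<linear_dim_1>`; with this fix it maps to
`#gpu.thread<linear_dim_0>`. A minimal standalone sketch of the new
counting scheme (plain ints stand in for the pass's OpFoldResult tile
sizes; this is an illustration, not the pass itself):

    #include <cstdio>
    #include <vector>

    int main() {
      // thread = [0, 16]: dim 0 is untiled, dim 1 is tiled by 16.
      std::vector<int> tileSizes = {0, 16};
      int idx = 0;
      for (int size : tileSizes) {
        if (size != 0)
          // idx advances only for tiled dims, so ids start at
          // linear_dim_0 and increment by 1; the old enumerate
          // index (1 here) left a gap.
          std::printf("linear_dim_%d\n", idx++);
      }
      return 0;
    }

Running the sketch prints `linear_dim_0`; the pre-fix indexing would
have printed `linear_dim_1`.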

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
index 6862aff..acd1890 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUApplyTilingLevel.cpp
@@ -107,10 +107,13 @@
       // TODO: Add some helpers to construct this based on the enum type rather
       // than doing it here.
       SmallVector<DeviceMappingAttrInterface> mapping;
-      for (auto [idx, size] : llvm::enumerate(tileSizes)) {
+      // Use a separate counter so the linear mapping ids stay contiguous
+      // when untiled (zero tile size) dims are skipped below.
+      int idx = 0;
+      for (auto size : tileSizes) {
         if (!isConstantIntValue(size, 0)) {
           unsigned mappingId =
-              static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx;
+              static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
           mapping.push_back(gpu::GPUThreadMappingAttr::get(
               context, static_cast<gpu::MappingId>(mappingId)));
         }
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
index 8c76a6e..ab39fc2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir
@@ -37,6 +37,42 @@
 
 // -----
 
+#config = #iree_gpu.lowering_config<{thread = [0, 16]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @sequential_forall_mappings() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %6 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = ["parallel", "parallel"]
+      } ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs =  {lowering_config = #config} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %7 = arith.addf %in, %in_0 : f32
+      linalg.yield %7 : f32
+    } -> tensor<4x256xf32>
+    flow.dispatch.tensor.store %6, %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : tensor<4x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    return
+  }
+}
+
+// Verify that no loops are generated without a reduction configuration.
+// CHECK-LABEL: func.func @sequential_forall_mappings
+//   CHECK-NOT:   scf.for
+
+// THREAD-LABEL: func.func @sequential_forall_mappings
+//       THREAD:   scf.forall ({{.*}}) = (0) to (256) step (16)
+//       THREAD:     linalg.generic {{.*}} ins(%{{.*}}: tensor<4x16xf32>, tensor<4x16xf32>)
+//       THREAD:     scf.forall.in_parallel
+//       THREAD:   mapping = [#gpu.thread<linear_dim_0>]
+
+// -----
+
 module {
   func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [128, 2, 1] subgroup_size = 64>} {
     %c4 = arith.constant 4 : index