[AMDGPU][LDS] Support linearized DMA for small innermost dimensions (#23056)

When the linalg.copy output comes from tensor.empty(), the
`AMDGPULowerCoalescedDMAToGatherLDS` pass can linearize the destination
memref.

This patch allows coalesced DMA for tensors with small innermost
dimensions (e.g., `128x16xf32` where `16 < minElementsPerTransfer=64`)
by linearizing the tensor and using total elements instead of innermost
dimension for the size check.

This change relaxes the size check rather than adding a new gate: the
change in GPUConvertToCoalescedDMA simply allows copies with small
innermost dimensions to proceed when linearization is safe, instead of
rejecting them outright.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
index 8d4f0fa..ec94c32 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
@@ -58,6 +58,45 @@
   return mapping;
 }
 
+/// Check if a value traces back to tensor.empty (possibly through forall args).
+static bool tracesToTensorEmpty(Value value) {
+  // Direct tensor.empty.
+  if (value.getDefiningOp<tensor::EmptyOp>()) {
+    return true;
+  }
+
+  // Check if value is an extract_slice from a forall block argument.
+  auto extractSlice = value.getDefiningOp<tensor::ExtractSliceOp>();
+  if (!extractSlice) {
+    return false;
+  }
+
+  auto blockArg = dyn_cast<BlockArgument>(extractSlice.getSource());
+  if (!blockArg) {
+    return false;
+  }
+
+  auto forallOp = dyn_cast<scf::ForallOp>(blockArg.getOwner()->getParentOp());
+  if (!forallOp) {
+    return false;
+  }
+
+  // Find the corresponding shared_out init value.
+  unsigned numIVs = forallOp.getInductionVars().size();
+  unsigned argIndex = blockArg.getArgNumber();
+  if (argIndex < numIVs) {
+    return false;
+  }
+
+  unsigned sharedOutIndex = argIndex - numIVs;
+  if (sharedOutIndex >= forallOp.getOutputs().size()) {
+    return false;
+  }
+
+  Value initValue = forallOp.getOutputs()[sharedOutIndex];
+  return initValue.getDefiningOp<tensor::EmptyOp>() != nullptr;
+}
+
 /// Helper to compute thread number of threads based on translation_info.
 /// Uses the subgroup_size from translation_info for thread-level tiling.
 static SmallVector<OpFoldResult>
@@ -117,9 +156,21 @@
         std::min(minElementsPerTransfer, elementsPerTransfer);
   }
 
-  // If no valid DMA size found or innermost dim is too small, skip.
+  // Determine how many elements are available for coalesced access.
+  // For CopyOp with output tracing to tensor.empty(), we can linearize.
+  ArrayRef<int64_t> shape = outputType.getShape();
+  int64_t availableElements = innermostDim;
+  if (auto copyOp = dyn_cast<linalg::CopyOp>(op)) {
+    Value output = copyOp.getOutputs()[0];
+    if (tracesToTensorEmpty(output) &&
+        llvm::none_of(shape, ShapedType::isDynamic)) {
+      availableElements = ShapedType::getNumElements(shape);
+    }
+  }
+
+  // If no valid DMA size found or available elements are too small, skip.
   if (minElementsPerTransfer == std::numeric_limits<int64_t>::max() ||
-      innermostDim < minElementsPerTransfer) {
+      availableElements < minElementsPerTransfer) {
     return {};
   }
 
@@ -583,9 +634,11 @@
     int64_t totalWarps = llvm::product_of(positiveWarps);
 
     // Greedily distribute warps to outer dimensions, keeping innermost whole.
+    // For 1D tensors, distribute across the single dimension (no inner/outer).
     int64_t remainingWarps = totalWarps;
     for (int64_t i = 0; i < rank; ++i) {
-      if (i == rank - 1) {
+      bool isInnermostOfMultiDim = (i == rank - 1) && (rank > 1);
+      if (isInnermostOfMultiDim) {
         // Keep innermost dimension whole (tile size = full dimension).
         tileSizes.push_back(rewriter.getIndexAttr(shape[i]));
         ++numTiledDims;
@@ -684,9 +737,23 @@
           std::min(minElementsPerTransfer, elementsPerTransfer);
     }
 
-    // If no valid DMA size found or innermost dim is too small, skip.
+    // Determine how many elements are available for coalesced access.
+    // For CopyOp with tensor.empty() output, we can linearize all dimensions.
+    // Otherwise, we can only use the innermost dimension.
+    int64_t availableElements = innermostDim;
+    if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
+      Value output = copyOp.getOutputs()[0];
+      if (output.getDefiningOp<tensor::EmptyOp>()) {
+        // Can linearize all dimensions - compute total static elements.
+        if (llvm::none_of(shape, ShapedType::isDynamic)) {
+          availableElements = ShapedType::getNumElements(shape);
+        }
+      }
+    }
+
+    // If no valid DMA size found or available elements are too small, skip.
     if (minElementsPerTransfer == std::numeric_limits<int64_t>::max() ||
-        innermostDim < minElementsPerTransfer) {
+        availableElements < minElementsPerTransfer) {
       return failure();
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
index e9a35e6..bb52bb4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
@@ -200,3 +200,218 @@
 
   return %result : tensor<64x128xf32>
 }
+
+// -----
+
+// Test: Small innermost dimension with tensor.empty() output CAN be linearized.
+// When output comes from tensor.empty(), we can use total elements instead of
+// innermost dimension for the size check, enabling coalesced DMA.
+
+#gpu_target_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  max_load_instruction_bits = 128, subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  dma_sizes = [32]
+>>
+
+#exec_target_linearize = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_linearize}>
+#translation_linearize = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_small_innermost_linearized
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tensor<128x16xf32>
+  attributes {hal.executable.target = #exec_target_linearize, translation_info = #translation_linearize} {
+  %empty = tensor.empty() : tensor<128x16xf32>
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+    ins(%source : tensor<128x16xf32>)
+    outs(%empty : tensor<128x16xf32>) -> tensor<128x16xf32>
+
+  // Innermost dimension (16) < minElementsPerTransfer (64), but since output is
+  // tensor.empty(), we use total elements (2048) for the check, which passes.
+  // With 4 warps (256/64), rows are tiled to 32 (128/4), columns kept whole at 16.
+
+  // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<128x16xf32>
+
+  // Warp-level forall: step (32, 16) distributes 128 rows across 4 warps
+  // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (128, 16) step (32, 16)
+  // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<128x16xf32>) {
+  // CHECK:   %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [32, 16] [1, 1]
+  // CHECK-SAME:   : tensor<128x16xf32> to tensor<32x16xf32>
+  // CHECK:   %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
+  // CHECK-SAME:   : tensor<128x16xf32> to tensor<32x16xf32>
+
+  // Thread-level forall with 64 lanes
+  // CHECK:   %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+  // CHECK-SAME:   shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<32x16xf32>) {
+  // CHECK:     scf.forall.in_parallel {
+  // CHECK:       iree_gpu.coalesced_gather_dma %[[SLICE_SRC]] into %[[THREAD_INIT]] lane(%[[LANE]])
+  // CHECK-SAME:       : tensor<32x16xf32>, tensor<32x16xf32>, index
+  // CHECK:     }
+  // CHECK:   } {mapping = [#iree_gpu.lane_id<0>]}
+
+  // CHECK:   scf.forall.in_parallel {
+  // CHECK:     tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
+  // CHECK-SAME:     : tensor<32x16xf32> into tensor<128x16xf32>
+  // CHECK:   }
+  // CHECK: }
+
+  // CHECK: return %[[WARP_RESULT]]
+  // CHECK-NOT: linalg.copy
+
+  return %result : tensor<128x16xf32>
+}
+
+// -----
+
+// Test: 1D tensor copy distributes warps across the single dimension.
+// This tests the 1D tile size computation logic for flattened copies.
+
+#gpu_target_1d = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  max_load_instruction_bits = 128, subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  dma_sizes = [32]
+>>
+
+#exec_target_1d = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_1d}>
+#translation_1d = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_1d_tensor
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: tensor<2048xf32>
+func.func @copy_1d_tensor(%source: tensor<2048xf32>) -> tensor<2048xf32>
+  attributes {hal.executable.target = #exec_target_1d, translation_info = #translation_1d} {
+  %empty = tensor.empty() : tensor<2048xf32>
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+    ins(%source : tensor<2048xf32>)
+    outs(%empty : tensor<2048xf32>) -> tensor<2048xf32>
+
+  // With 4 warps (256/64) and 2048 elements:
+  // - Tile size = ceil(2048/4) = 512 elements per warp
+  // - Step = 512, distributing the single dimension across warps
+
+  // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2048xf32>
+
+  // Warp-level forall: step (512) distributes 2048 elements across 4 warps
+  // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV:.+]]) = (0) to (2048) step (512)
+  // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<2048xf32>) {
+  // CHECK:   %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV]]] [512] [1]
+  // CHECK-SAME:   : tensor<2048xf32> to tensor<512xf32>
+  // CHECK:   %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV]]] [512] [1]
+  // CHECK-SAME:   : tensor<2048xf32> to tensor<512xf32>
+
+  // Thread-level forall with 64 lanes
+  // CHECK:   %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+  // CHECK-SAME:   shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<512xf32>) {
+  // CHECK:     scf.forall.in_parallel {
+  // CHECK:       iree_gpu.coalesced_gather_dma %[[SLICE_SRC]] into %[[THREAD_INIT]] lane(%[[LANE]])
+  // CHECK-SAME:       : tensor<512xf32>, tensor<512xf32>, index
+  // CHECK:     }
+  // CHECK:   } {mapping = [#iree_gpu.lane_id<0>]}
+
+  // CHECK:   scf.forall.in_parallel {
+  // CHECK:     tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV]]] [512] [1]
+  // CHECK-SAME:     : tensor<512xf32> into tensor<2048xf32>
+  // CHECK:   }
+  // CHECK: }
+
+  // CHECK: return %[[WARP_RESULT]]
+  // CHECK-NOT: linalg.copy
+
+  return %result : tensor<2048xf32>
+}
+
+// -----
+
+// Negative test: Small innermost dimension with non-tensor.empty output should
+// NOT be linearized. The copy should remain unchanged because:
+// 1. Innermost dim (16) < minElementsPerTransfer (64)
+// 2. Output is a function argument, not tensor.empty, so we can't linearize
+
+#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  max_load_instruction_bits = 128, subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  dma_sizes = [32]
+>>
+
+#exec_target_no_linearize = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_no_linearize}>
+#translation_no_linearize = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_small_innermost_no_linearize
+// CHECK-SAME:    %[[SRC:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+// CHECK-SAME:    %[[DST:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+func.func @copy_small_innermost_no_linearize(%source: tensor<128x16xf32>, %dest: tensor<128x16xf32>) -> tensor<128x16xf32>
+  attributes {hal.executable.target = #exec_target_no_linearize, translation_info = #translation_no_linearize} {
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+    ins(%source : tensor<128x16xf32>)
+    outs(%dest : tensor<128x16xf32>) -> tensor<128x16xf32>
+
+  // Innermost dimension (16) < minElementsPerTransfer (64), and output is not
+  // tensor.empty(), so linearization is not possible. The copy should remain.
+
+  // CHECK: %[[RESULT:.+]] = linalg.copy
+  // CHECK-SAME: ins(%[[SRC]] : tensor<128x16xf32>)
+  // CHECK-SAME: outs(%[[DST]] : tensor<128x16xf32>)
+  // CHECK: return %[[RESULT]]
+
+  return %result : tensor<128x16xf32>
+}
+
+// -----
+
+// Test: Copy with extract_slice input (source from a slice of a larger tensor).
+// The copy should be converted to coalesced DMA when the input comes from an
+// extract_slice with contiguous innermost dimensions.
+
+#gpu_target_extract_input = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+  compute = fp32, storage = b32, subgroup = shuffle,
+  max_load_instruction_bits = 128, subgroup_size_choices = [64],
+  max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+  max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+  dma_sizes = [32]
+>>
+
+#exec_target_extract_input = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_extract_input}>
+#translation_extract_input = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_with_extract_slice_input
+// CHECK-SAME:    %[[LARGE_SRC:[a-zA-Z0-9]+]]: tensor<256x128xf32>
+func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) -> tensor<64x128xf32>
+  attributes {hal.executable.target = #exec_target_extract_input, translation_info = #translation_extract_input} {
+  // Extract a contiguous slice from the larger source tensor.
+  // The innermost dimension (128) is fully taken, so this is a contiguous slice.
+  %c32 = arith.constant 32 : index
+  %slice = tensor.extract_slice %large_source[%c32, 0] [64, 128] [1, 1]
+    : tensor<256x128xf32> to tensor<64x128xf32>
+
+  %empty = tensor.empty() : tensor<64x128xf32>
+  %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+    ins(%slice : tensor<64x128xf32>)
+    outs(%empty : tensor<64x128xf32>) -> tensor<64x128xf32>
+
+  // The copy should be converted to coalesced DMA.
+  // With 4 warps (256/64) and 64x128 tensor:
+  // - Rows are tiled: step = 64/4 = 16
+  // - Columns are NOT tiled (step = 128, full dimension) to ensure contiguous subviews
+
+  // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x128xf32>
+  // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 128) step (16, 128)
+  // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<64x128xf32>) {
+
+  // The source slice is further sliced for each warp
+  // CHECK:   %[[SLICE_SRC:.+]] = tensor.extract_slice
+  // CHECK:   %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]]
+
+  // Thread-level forall with 64 lanes
+  // CHECK:   %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+  // CHECK:     iree_gpu.coalesced_gather_dma
+  // CHECK:   } {mapping = [#iree_gpu.lane_id<0>]}
+
+  // CHECK: return %[[WARP_RESULT]]
+  // CHECK-NOT: linalg.copy
+
+  return %result : tensor<64x128xf32>
+}