[AMDGPU][LDS] Support linearized DMA for small innermost dimensions (#23056)
When the `linalg.copy` output comes from `tensor.empty()`, the
`AMDGPULowerCoalescedDMAToGatherLDS` pass can linearize the destination
memref.
This patch allows coalesced DMA for tensors with small innermost
dimensions (e.g., `128x16xf32`, where `16 < minElementsPerTransfer=64`)
by linearizing the tensor and using the total element count instead of
the innermost dimension for the size check.
The change in `GPUConvertToCoalescedDMA` gates this optimization: copies
with small innermost dimensions are allowed to proceed when linearization
is safe, instead of being rejected outright.
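
For reference, a minimal sketch of the copy pattern that now qualifies, mirroring the new lit test added below (`%source` is assumed to be a function argument):

```mlir
// Innermost dim is 16 (< minElementsPerTransfer = 64), but the destination
// comes from tensor.empty(), so the check uses the total element count
// 128 * 16 = 2048 and the copy can still be lowered to a coalesced DMA.
%empty = tensor.empty() : tensor<128x16xf32>
%result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
    ins(%source : tensor<128x16xf32>)
    outs(%empty : tensor<128x16xf32>) -> tensor<128x16xf32>
```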
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
index 8d4f0fa..ec94c32 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
@@ -58,6 +58,45 @@
return mapping;
}
+/// Check if a value traces back to tensor.empty (possibly through forall args).
+static bool tracesToTensorEmpty(Value value) {
+ // Direct tensor.empty.
+ if (value.getDefiningOp<tensor::EmptyOp>()) {
+ return true;
+ }
+
+ // Check if value is an extract_slice from a forall block argument.
+ auto extractSlice = value.getDefiningOp<tensor::ExtractSliceOp>();
+ if (!extractSlice) {
+ return false;
+ }
+
+ auto blockArg = dyn_cast<BlockArgument>(extractSlice.getSource());
+ if (!blockArg) {
+ return false;
+ }
+
+ auto forallOp = dyn_cast<scf::ForallOp>(blockArg.getOwner()->getParentOp());
+ if (!forallOp) {
+ return false;
+ }
+
+ // Find the corresponding shared_out init value.
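+ // (Block arguments of an scf.forall region are laid out as
+ // [induction vars..., shared_outs...], hence the index adjustment below.)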
+ unsigned numIVs = forallOp.getInductionVars().size();
+ unsigned argIndex = blockArg.getArgNumber();
+ if (argIndex < numIVs) {
+ return false;
+ }
+
+ unsigned sharedOutIndex = argIndex - numIVs;
+ if (sharedOutIndex >= forallOp.getOutputs().size()) {
+ return false;
+ }
+
+ Value initValue = forallOp.getOutputs()[sharedOutIndex];
+ return initValue.getDefiningOp<tensor::EmptyOp>() != nullptr;
+}
+
/// Helper to compute the number of threads based on translation_info.
/// Uses the subgroup_size from translation_info for thread-level tiling.
static SmallVector<OpFoldResult>
@@ -117,9 +156,21 @@
std::min(minElementsPerTransfer, elementsPerTransfer);
}
- // If no valid DMA size found or innermost dim is too small, skip.
+ // Determine how many elements are available for coalesced access.
+ // For CopyOp with output tracing to tensor.empty(), we can linearize.
+ ArrayRef<int64_t> shape = outputType.getShape();
+ int64_t availableElements = innermostDim;
+ if (auto copyOp = dyn_cast<linalg::CopyOp>(op)) {
+ Value output = copyOp.getOutputs()[0];
+ if (tracesToTensorEmpty(output) &&
+ llvm::none_of(shape, ShapedType::isDynamic)) {
+ availableElements = ShapedType::getNumElements(shape);
+ }
+ }
+
+ // If no valid DMA size found or available elements are too small, skip.
if (minElementsPerTransfer == std::numeric_limits<int64_t>::max() ||
- innermostDim < minElementsPerTransfer) {
+ availableElements < minElementsPerTransfer) {
return {};
}
@@ -583,9 +634,11 @@
int64_t totalWarps = llvm::product_of(positiveWarps);
// Greedily distribute warps to outer dimensions, keeping innermost whole.
+ // For 1D tensors, distribute across the single dimension (no inner/outer).
int64_t remainingWarps = totalWarps;
for (int64_t i = 0; i < rank; ++i) {
- if (i == rank - 1) {
+ bool isInnermostOfMultiDim = (i == rank - 1) && (rank > 1);
+ if (isInnermostOfMultiDim) {
// Keep innermost dimension whole (tile size = full dimension).
tileSizes.push_back(rewriter.getIndexAttr(shape[i]));
++numTiledDims;
@@ -684,9 +737,23 @@
std::min(minElementsPerTransfer, elementsPerTransfer);
}
- // If no valid DMA size found or innermost dim is too small, skip.
+ // Determine how many elements are available for coalesced access.
+ // For CopyOp with tensor.empty() output, we can linearize all dimensions.
+ // Otherwise, we can only use the innermost dimension.
+ int64_t availableElements = innermostDim;
+ if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
+ Value output = copyOp.getOutputs()[0];
+ if (output.getDefiningOp<tensor::EmptyOp>()) {
+ // Can linearize all dimensions - compute total static elements.
+ if (llvm::none_of(shape, ShapedType::isDynamic)) {
+ availableElements = ShapedType::getNumElements(shape);
+ }
+ }
+ }
+
+ // If no valid DMA size found or available elements are too small, skip.
if (minElementsPerTransfer == std::numeric_limits<int64_t>::max() ||
- innermostDim < minElementsPerTransfer) {
+ availableElements < minElementsPerTransfer) {
return failure();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
index e9a35e6..bb52bb4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir
@@ -200,3 +200,218 @@
return %result : tensor<64x128xf32>
}
+
+// -----
+
+// Test: Small innermost dimension with tensor.empty() output CAN be linearized.
+// When output comes from tensor.empty(), we can use total elements instead of
+// innermost dimension for the size check, enabling coalesced DMA.
+
+#gpu_target_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+ compute = fp32, storage = b32, subgroup = shuffle,
+ max_load_instruction_bits = 128, subgroup_size_choices = [64],
+ max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+ max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+ dma_sizes = [32]
+>>
+
+#exec_target_linearize = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_linearize}>
+#translation_linearize = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_small_innermost_linearized
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tensor<128x16xf32>
+ attributes {hal.executable.target = #exec_target_linearize, translation_info = #translation_linearize} {
+ %empty = tensor.empty() : tensor<128x16xf32>
+ %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+ ins(%source : tensor<128x16xf32>)
+ outs(%empty : tensor<128x16xf32>) -> tensor<128x16xf32>
+
+ // Innermost dimension (16) < minElementsPerTransfer (64), but since output is
+ // tensor.empty(), we use total elements (2048) for the check, which passes.
+ // With 4 warps (256/64), rows are tiled to 32 (128/4), columns kept whole at 16.
+
+ // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<128x16xf32>
+
+ // Warp-level forall: step (32, 16) distributes 128 rows across 4 warps
+ // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (128, 16) step (32, 16)
+ // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<128x16xf32>) {
+ // CHECK: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [32, 16] [1, 1]
+ // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
+ // CHECK: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
+ // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
+
+ // Thread-level forall with 64 lanes
+ // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+ // CHECK-SAME: shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<32x16xf32>) {
+ // CHECK: scf.forall.in_parallel {
+ // CHECK: iree_gpu.coalesced_gather_dma %[[SLICE_SRC]] into %[[THREAD_INIT]] lane(%[[LANE]])
+ // CHECK-SAME: : tensor<32x16xf32>, tensor<32x16xf32>, index
+ // CHECK: }
+ // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
+
+ // CHECK: scf.forall.in_parallel {
+ // CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
+ // CHECK-SAME: : tensor<32x16xf32> into tensor<128x16xf32>
+ // CHECK: }
+ // CHECK: }
+
+ // CHECK: return %[[WARP_RESULT]]
+ // CHECK-NOT: linalg.copy
+
+ return %result : tensor<128x16xf32>
+}
+
+// -----
+
+// Test: 1D tensor copy distributes warps across the single dimension.
+// This tests the 1D tile size computation logic for flattened copies.
+
+#gpu_target_1d = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+ compute = fp32, storage = b32, subgroup = shuffle,
+ max_load_instruction_bits = 128, subgroup_size_choices = [64],
+ max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+ max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+ dma_sizes = [32]
+>>
+
+#exec_target_1d = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_1d}>
+#translation_1d = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_1d_tensor
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<2048xf32>
+func.func @copy_1d_tensor(%source: tensor<2048xf32>) -> tensor<2048xf32>
+ attributes {hal.executable.target = #exec_target_1d, translation_info = #translation_1d} {
+ %empty = tensor.empty() : tensor<2048xf32>
+ %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+ ins(%source : tensor<2048xf32>)
+ outs(%empty : tensor<2048xf32>) -> tensor<2048xf32>
+
+ // With 4 warps (256/64) and 2048 elements:
+ // - Tile size = ceil(2048/4) = 512 elements per warp
+ // - Step = 512, distributing the single dimension across warps
+
+ // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2048xf32>
+
+ // Warp-level forall: step (512) distributes 2048 elements across 4 warps
+ // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV:.+]]) = (0) to (2048) step (512)
+ // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<2048xf32>) {
+ // CHECK: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV]]] [512] [1]
+ // CHECK-SAME: : tensor<2048xf32> to tensor<512xf32>
+ // CHECK: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV]]] [512] [1]
+ // CHECK-SAME: : tensor<2048xf32> to tensor<512xf32>
+
+ // Thread-level forall with 64 lanes
+ // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+ // CHECK-SAME: shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<512xf32>) {
+ // CHECK: scf.forall.in_parallel {
+ // CHECK: iree_gpu.coalesced_gather_dma %[[SLICE_SRC]] into %[[THREAD_INIT]] lane(%[[LANE]])
+ // CHECK-SAME: : tensor<512xf32>, tensor<512xf32>, index
+ // CHECK: }
+ // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
+
+ // CHECK: scf.forall.in_parallel {
+ // CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV]]] [512] [1]
+ // CHECK-SAME: : tensor<512xf32> into tensor<2048xf32>
+ // CHECK: }
+ // CHECK: }
+
+ // CHECK: return %[[WARP_RESULT]]
+ // CHECK-NOT: linalg.copy
+
+ return %result : tensor<2048xf32>
+}
+
+// -----
+
+// Negative test: Small innermost dimension with non-tensor.empty output should
+// NOT be linearized. The copy should remain unchanged because:
+// 1. Innermost dim (16) < minElementsPerTransfer (64)
+// 2. Output is a function argument, not tensor.empty, so we can't linearize
+
+#gpu_target_no_linearize = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+ compute = fp32, storage = b32, subgroup = shuffle,
+ max_load_instruction_bits = 128, subgroup_size_choices = [64],
+ max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+ max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+ dma_sizes = [32]
+>>
+
+#exec_target_no_linearize = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_no_linearize}>
+#translation_no_linearize = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_small_innermost_no_linearize
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+// CHECK-SAME: %[[DST:[a-zA-Z0-9]+]]: tensor<128x16xf32>
+func.func @copy_small_innermost_no_linearize(%source: tensor<128x16xf32>, %dest: tensor<128x16xf32>) -> tensor<128x16xf32>
+ attributes {hal.executable.target = #exec_target_no_linearize, translation_info = #translation_no_linearize} {
+ %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+ ins(%source : tensor<128x16xf32>)
+ outs(%dest : tensor<128x16xf32>) -> tensor<128x16xf32>
+
+ // Innermost dimension (16) < minElementsPerTransfer (64), and output is not
+ // tensor.empty(), so linearization is not possible. The copy should remain.
+
+ // CHECK: %[[RESULT:.+]] = linalg.copy
+ // CHECK-SAME: ins(%[[SRC]] : tensor<128x16xf32>)
+ // CHECK-SAME: outs(%[[DST]] : tensor<128x16xf32>)
+ // CHECK: return %[[RESULT]]
+
+ return %result : tensor<128x16xf32>
+}
+
+// -----
+
+// Test: Copy with extract_slice input (source from a slice of a larger tensor).
+// The copy should be converted to coalesced DMA when the input comes from an
+// extract_slice with contiguous innermost dimensions.
+
+#gpu_target_extract_input = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
+ compute = fp32, storage = b32, subgroup = shuffle,
+ max_load_instruction_bits = 128, subgroup_size_choices = [64],
+ max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024,
+ max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+ dma_sizes = [32]
+>>
+
+#exec_target_extract_input = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #gpu_target_extract_input}>
+#translation_extract_input = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64>
+
+// CHECK-LABEL: func.func @copy_with_extract_slice_input
+// CHECK-SAME: %[[LARGE_SRC:[a-zA-Z0-9]+]]: tensor<256x128xf32>
+func.func @copy_with_extract_slice_input(%large_source: tensor<256x128xf32>) -> tensor<64x128xf32>
+ attributes {hal.executable.target = #exec_target_extract_input, translation_info = #translation_extract_input} {
+ // Extract a contiguous slice from the larger source tensor.
+ // The innermost dimension (128) is fully taken, so this is a contiguous slice.
+ %c32 = arith.constant 32 : index
+ %slice = tensor.extract_slice %large_source[%c32, 0] [64, 128] [1, 1]
+ : tensor<256x128xf32> to tensor<64x128xf32>
+
+ %empty = tensor.empty() : tensor<64x128xf32>
+ %result = linalg.copy {lowering_config = #iree_gpu.use_global_load_dma}
+ ins(%slice : tensor<64x128xf32>)
+ outs(%empty : tensor<64x128xf32>) -> tensor<64x128xf32>
+
+ // The copy should be converted to coalesced DMA.
+ // With 4 warps (256/64) and 64x128 tensor:
+ // - Rows are tiled: step = 64/4 = 16
+ // - Columns are NOT tiled (step = 128, full dimension) to ensure contiguous subviews
+
+ // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x128xf32>
+ // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 128) step (16, 128)
+ // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<64x128xf32>) {
+
+ // The source slice is further sliced for each warp
+ // CHECK: %[[SLICE_SRC:.+]] = tensor.extract_slice
+ // CHECK: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]]
+
+ // Thread-level forall with 64 lanes
+ // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+ // CHECK: iree_gpu.coalesced_gather_dma
+ // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
+
+ // CHECK: return %[[WARP_RESULT]]
+ // CHECK-NOT: linalg.copy
+
+ return %result : tensor<64x128xf32>
+}