[LLVMGPU] Set prefetching on translation info (#17744)
This patch makes prefetch_shared_memory part of the translation_info config
dictionary, allowing us to control prefetching at the dispatch level,
instead of globally turning it on/off. Prefetching is still off by
default; the flag makes KernelConfig add the prefetch_shared_memory unit
attribute to the config dictionary.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index a6415cd..5858759 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -15,6 +15,7 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
#include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -70,6 +71,11 @@
// TODO: We should get this value from the target's parallelism.
llvm::cl::init(512 * 512));
+static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
+ "iree-llvmgpu-enable-prefetch",
+ llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
+ llvm::cl::init(false));
+
namespace {
using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
@@ -356,6 +362,14 @@
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+ // Prefetch shared memory if requested.
+ if (clLLVMGPUEnablePrefetch) {
+ attrs.emplace_back(
+ StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+ UnitAttr::get(context));
+ }
+
auto configDict = DictionaryAttr::get(context, attrs);
return setOpConfigAndEntryPointFnTranslation(
@@ -566,6 +580,14 @@
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+ // Prefetch shared memory if requested.
+ if (clLLVMGPUEnablePrefetch) {
+ attrs.emplace_back(
+ StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+ UnitAttr::get(context));
+ }
+
auto configDict = DictionaryAttr::get(context, attrs);
return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 2fb3295..6ee91d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -88,6 +88,8 @@
if (DictionaryAttr config = translationInfo.getConfiguration()) {
if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
+ if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
+ pipelineOptions.prefetchSharedMemory = true;
if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
// Get the workgroups reorder config and enable the workgroup reordering.
Attribute reorderWorkgroupOption =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index a417e31..1698207 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -72,11 +72,6 @@
"allocated for the given target"),
llvm::cl::init(163 * 1024));
-static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
- "iree-llvmgpu-enable-prefetch",
- llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
- llvm::cl::init(false));
-
llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const LLVMGPUPipelineOptions &options) {
StringRef reorderStr = "<not set>";
@@ -93,7 +88,8 @@
}
return os << "{" << "enableReduceSharedMemoryBankConflicts = "
- << options.enableReduceSharedMemoryBankConflicts
+ << options.enableReduceSharedMemoryBankConflicts << ", "
+ << "prefetchSharedMemory = " << options.prefetchSharedMemory
<< ", reorderWorkgroupsStrategy = " << reorderStr
<< ", enableUkernels = " << options.enableUkernels << "}";
}
@@ -790,7 +786,7 @@
funcPassManager.addPass(createGPUReduceBankConflictsPass(options));
}
- if (clLLVMGPUEnablePrefetch) {
+ if (options.prefetchSharedMemory) {
funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass());
}
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 0492d49..488705f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -29,10 +29,12 @@
inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
"no_reduce_shared_memory_bank_conflicts";
+inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
} // namespace LLVMGPUAttrNames
struct LLVMGPUPipelineOptions {
bool enableReduceSharedMemoryBankConflicts = true;
+ bool prefetchSharedMemory = false;
bool enableUkernels = false;
std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
};
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index a8bfa71..d0254d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -44,6 +44,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -91,6 +92,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -156,6 +158,8 @@
}
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK-SAME: prefetch_shared_memory
+
// CHECK: func @expanded_matmul_transpose_b
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -272,6 +276,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
// This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -321,7 +326,7 @@
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-
+// RDNA3-SAME: prefetch_shared_memory
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
@@ -373,6 +378,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -462,6 +468,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]