[LLVMGPU] Set prefetching on translation info (#17744)
This patch makes prefetch_shared_memory part of the translation_info config
dictionary, allowing us to control prefetching at the dispatch level,
instead of globally turning it on/off. Prefetching is still off by
default; the flag makes KernelConfig add the prefetch_shared_memory unit
attribute to the config dictionary.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index a6415cd..5858759 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -15,6 +15,7 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
#include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -70,6 +71,11 @@
// TODO: We should get this value from the target's parallelism.
llvm::cl::init(512 * 512));
+static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
+ "iree-llvmgpu-enable-prefetch",
+ llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
+ llvm::cl::init(false));
+
namespace {
using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
@@ -356,6 +362,14 @@
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+ // Prefetch shared memory if requested.
+ if (clLLVMGPUEnablePrefetch) {
+ attrs.emplace_back(
+ StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+ UnitAttr::get(context));
+ }
+
auto configDict = DictionaryAttr::get(context, attrs);
return setOpConfigAndEntryPointFnTranslation(
@@ -566,6 +580,14 @@
schedule->nWarpCount);
SmallVector<NamedAttribute, 1> attrs;
attrs.emplace_back(StringAttr::get(context, "mma_schedule"), scheduleAttr);
+
+ // Prefetch shared memory if requested.
+ if (clLLVMGPUEnablePrefetch) {
+ attrs.emplace_back(
+ StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
+ UnitAttr::get(context));
+ }
+
auto configDict = DictionaryAttr::get(context, attrs);
return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 2fb3295..6ee91d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -88,6 +88,8 @@
if (DictionaryAttr config = translationInfo.getConfiguration()) {
if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
+ if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
+ pipelineOptions.prefetchSharedMemory = true;
if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
// Get the workgroups reorder config and enable the workgroup reordering.
Attribute reorderWorkgroupOption =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index a417e31..1698207 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -72,11 +72,6 @@
"allocated for the given target"),
llvm::cl::init(163 * 1024));
-static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch(
- "iree-llvmgpu-enable-prefetch",
- llvm::cl::desc("Enable prefetch in the vector distribute pipeline"),
- llvm::cl::init(false));
-
llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const LLVMGPUPipelineOptions &options) {
StringRef reorderStr = "<not set>";
@@ -93,7 +88,8 @@
}
return os << "{" << "enableReduceSharedMemoryBankConflicts = "
- << options.enableReduceSharedMemoryBankConflicts
+ << options.enableReduceSharedMemoryBankConflicts << ", "
+ << "prefetchSharedMemory = " << options.prefetchSharedMemory
<< ", reorderWorkgroupsStrategy = " << reorderStr
<< ", enableUkernels = " << options.enableUkernels << "}";
}
@@ -790,7 +786,7 @@
funcPassManager.addPass(createGPUReduceBankConflictsPass(options));
}
- if (clLLVMGPUEnablePrefetch) {
+ if (options.prefetchSharedMemory) {
funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass());
}
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 0492d49..488705f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -29,10 +29,12 @@
inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
"no_reduce_shared_memory_bank_conflicts";
+inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
} // namespace LLVMGPUAttrNames
struct LLVMGPUPipelineOptions {
bool enableReduceSharedMemoryBankConflicts = true;
+ bool prefetchSharedMemory = false;
bool enableUkernels = false;
std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
};
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index a8bfa71..d0254d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -44,6 +44,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -91,6 +92,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -156,6 +158,8 @@
}
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK-SAME: prefetch_shared_memory
+
// CHECK: func @expanded_matmul_transpose_b
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -272,6 +276,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
// This has more than 2 iteartions. So we have prefetching enabled for this case. Due to
@@ -321,7 +326,7 @@
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-
+// RDNA3-SAME: prefetch_shared_memory
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
@@ -373,6 +378,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -462,6 +468,7 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>
+// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]