[NFC][GPU] Move LLVMGPUPipelineOptions to iree_gpu dialect (#18458)
This moves `LLVMGPUPipelineOptions` to `Codegen/Dialect/GPU` so that
pipeline options can be set by iree_gpu lowering configuration logic
(like `setMatmulLoweringConfig` in `ConfigUtils.cpp`).
A new attribute `GPUPipelineOptionsAttr` is added, with optional
parameters defined for each existing pipeline option. The assembly of
the pipeline attributes has changed, since the options are now part of
the `iree_gpu` dialect. For the purposes of setting user configurations
with transform dialect this PR changes the following:
All pipeline option attributes should now be contained in a single
`#iree_gpu.pipeline_options<>`
- `reorder_workgroups = "none"/"swizzle"/"transpose"` becomes one of the
pipeline options `reorder_workgroups_strategy = None/Swizzle/Transpose`
- `prefetch_shared_memory` becomes the pipeline option
`prefetch_shared_memory = true/false`
- `no_reduce_shared_memory_bank_conflicts` becomes the pipeline option
`no_reduce_shared_memory_bank_conflicts = true/false`
Example:
After this change, the translation_info config dict changes from
```
{reorder_workgroups = "swizzle", prefetch_shared_memory}
```
to
```
{gpu_pipeline_options =
#iree_gpu.pipeline_options<
prefetch_shared_memory = true, reorder_workgroups_strategy = Swizzle
>}
```
---------
Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
index 1bdd146..c1ddf72 100644
--- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
+++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
@@ -636,8 +636,8 @@
workgroup_size = [320, 1, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
- subgroup_m_count = 1, subgroup_n_count = 5>
- , reorder_workgroups = "transpose"}>
+ subgroup_m_count = 1, subgroup_n_count = 5>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
@@ -656,8 +656,9 @@
workgroup_size = [256, 1, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
- subgroup_m_count = 1, subgroup_n_count = 4>
- , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+ subgroup_m_count = 1, subgroup_n_count = 4>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>,
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
@@ -676,8 +677,8 @@
workgroup_size = [320, 1, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
- subgroup_m_count = 1, subgroup_n_count = 5>
- , reorder_workgroups = "transpose"}>
+ subgroup_m_count = 1, subgroup_n_count = 5>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
@@ -696,8 +697,8 @@
workgroup_size = [320, 1, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
- subgroup_m_count = 1, subgroup_n_count = 5>
- , reorder_workgroups = "transpose"}>
+ subgroup_m_count = 1, subgroup_n_count = 5>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
@@ -716,8 +717,9 @@
workgroup_size = [128, 4, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
- subgroup_m_count = 4, subgroup_n_count = 2>
- , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+ subgroup_m_count = 4, subgroup_n_count = 2>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>,
+ llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
@@ -736,8 +738,8 @@
workgroup_size = [320, 1, 1] subgroup_size = 64,
{mma_schedule = #iree_gpu.mma_schedule<
intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
- subgroup_m_count = 1, subgroup_n_count = 5>
- , reorder_workgroups = "transpose"}>
+ subgroup_m_count = 1, subgroup_n_count = 5>,
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
> -> !transform.any_param
transform.yield %conv, %config : !transform.any_op, !transform.any_param
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
index 3ec74b1..02721d6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
@@ -101,7 +101,7 @@
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(bool expandSubgroupReduction = true);
-enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };
+using IREE::GPU::ReorderWorkgroupsStrategy;
/// Reorders workgroup IDs.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 900aeee..bb7612e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -1353,6 +1353,25 @@
int64_t LaneIdAttr::getRelativeIndex() const { return getDim(); }
//===----------------------------------------------------------------------===//
+// GPU Pipeline Options
+//===----------------------------------------------------------------------===//
+
+GPUPipelineOptionsAttr GPUPipelineOptionsAttr::get(
+ MLIRContext *context, bool prefetchSharedMemory,
+ bool noReduceSharedMemoryBankConflicts,
+ std::optional<ReorderWorkgroupsStrategy> reorderWorkgroupsStrategy) {
+ auto strategyAttr = ReorderWorkgroupsStrategyAttr();
+ if (reorderWorkgroupsStrategy) {
+ strategyAttr =
+ ReorderWorkgroupsStrategyAttr::get(context, *reorderWorkgroupsStrategy);
+ }
+ Builder b(context);
+ return Base::get(context, b.getBoolAttr(prefetchSharedMemory),
+ b.getBoolAttr(noReduceSharedMemoryBankConflicts),
+ strategyAttr);
+}
+
+//===----------------------------------------------------------------------===//
// Attribute Registration
//===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index cdb7402..968fa73 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -430,4 +430,54 @@
}];
}
+//===----------------------------------------------------------------------===//
+// GPU Pipeline Options
+//===----------------------------------------------------------------------===//
+
+def IREEGPU_ReorderWorkgroupsStrategyAttr :
+ EnumAttr<IREEGPU_Dialect, IREEGPU_ReorderWorkgroupsStrategy, ""> {
+ let assemblyFormat = "``$value";
+ let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+}
+
+def IREEGPU_GPUPipelineOptionsAttr : AttrDef<IREEGPU_Dialect, "GPUPipelineOptions"> {
+ let summary = "GPU pipeline options attribute.";
+ let description = [{
+    This attribute describes lowering pipeline specific configuration options:
+ * prefetch_shared_memory: Boolean option indicating whether or not to run
+ the loop prefetching pass in the lowering pipeline.
+ * no_reduce_shared_memory_bank_conflicts: Boolean option indicating whether
+ or not to skip the bank conflict reduction pass in the lowering pipeline.
+ * reorder_workgroups_strategy: Enum attribute indicating which strategy to
+ choose for the workgroup reordering pass. Options are `None`, `Swizzle`,
+ and `Transpose`.
+ }];
+
+ let mnemonic = "pipeline_options";
+ let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+
+ let parameters = (ins
+ OptionalParameter<"BoolAttr">:$prefetch_shared_memory,
+ OptionalParameter<"BoolAttr">:$no_reduce_shared_memory_bank_conflicts,
+ OptionalParameter<"ReorderWorkgroupsStrategyAttr">:$reorder_workgroups_strategy
+ );
+
+ let builders = [
+ AttrBuilder<(ins
+ CArg<"bool", "false">:$prefetch_shared_memory,
+ CArg<"bool", "false">:$no_reduce_shared_memory_bank_conflicts,
+ CArg<"std::optional<ReorderWorkgroupsStrategy>", "{}">:$reorder_workgroups_strategy)>
+ ];
+
+ let assemblyFormat = "`<` struct(params) `>`";
+
+ let extraClassDeclaration = [{
+ // Returns the key name for GPUPipelineOptionsAttr in the translation info
+ // config dictionary.
+ static StringRef getDictKeyName() {
+ return "gpu_pipeline_options";
+ }
+ }];
+}
+
#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUATTRS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
index bdc590a..c147d88 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -168,4 +168,28 @@
Lane
]>;
+//===----------------------------------------------------------------------===//
+// Pipeline options
+//===----------------------------------------------------------------------===//
+
+class IREEGPU_I32PipelineEnumAttr<string name, string summary, list<I32EnumAttrCase> cases>
+ : I32EnumAttr<name, summary, cases> {
+ let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+ let genSpecializedAttr = 0;
+}
+
+// ReorderWorkgroups EnumAttrCases.
+def ReorderWorkgroupsNone : I32EnumAttrCase<"None", 0>;
+def ReorderWorkgroupsSwizzle : I32EnumAttrCase<"Swizzle", 1>;
+def ReorderWorkgroupsTranspose : I32EnumAttrCase<"Transpose", 2>;
+
+// EnumAttr for workgroup reordering strategy enums.
+def IREEGPU_ReorderWorkgroupsStrategy : IREEGPU_I32PipelineEnumAttr<"ReorderWorkgroupsStrategy",
+ "Strategy for workgroup reordering", [
+ ReorderWorkgroupsNone,
+ ReorderWorkgroupsSwizzle,
+ ReorderWorkgroupsTranspose
+ ]> {
+}
+
#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUENUMS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index cfe13f6..c9bb86c 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -473,4 +473,73 @@
workgroupSize, subgroupSize, DictionaryAttr());
}
+//===----------------------------------------------------------------------===//
+// Lowering Config Attributes
+//===----------------------------------------------------------------------===//
+
+GPUPipelineOptions
+getPipelineOptions(FunctionOpInterface funcOp,
+ IREE::Codegen::TranslationInfoAttr translationInfo) {
+ GPUPipelineOptions pipelineOptions = {};
+ auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+
+ if (DictionaryAttr config = translationInfo.getConfiguration()) {
+ std::optional<NamedAttribute> maybePipelineOptionsAttr =
+ config.getNamed(GPUPipelineOptionsAttr::getDictKeyName());
+ if (!maybePipelineOptionsAttr.has_value()) {
+ return pipelineOptions;
+ }
+ auto pipelineOptionsAttr =
+ cast<GPUPipelineOptionsAttr>(maybePipelineOptionsAttr->getValue());
+ BoolAttr prefetchSharedMemory =
+ pipelineOptionsAttr.getPrefetchSharedMemory();
+ if (prefetchSharedMemory) {
+ pipelineOptions.prefetchSharedMemory = prefetchSharedMemory.getValue();
+ }
+ BoolAttr noReduceBankConflicts =
+ pipelineOptionsAttr.getNoReduceSharedMemoryBankConflicts();
+ if (noReduceBankConflicts) {
+ pipelineOptions.enableReduceSharedMemoryBankConflicts =
+ !noReduceBankConflicts.getValue();
+ }
+ ReorderWorkgroupsStrategyAttr reorderWorkgroupsStrategy =
+ pipelineOptionsAttr.getReorderWorkgroupsStrategy();
+ if (reorderWorkgroupsStrategy) {
+ pipelineOptions.reorderStrategy = reorderWorkgroupsStrategy.getValue();
+ }
+ }
+
+ pipelineOptions.enableUkernels = targetAttr && hasUkernel(targetAttr);
+
+ LLVM_DEBUG(llvm::dbgs() << "GPU Pipeline Options: " << pipelineOptions
+ << "\n");
+ return pipelineOptions;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+ const GPUPipelineOptions &options) {
+ StringRef reorderStr = "<not set>";
+ if (options.reorderStrategy) {
+ switch (options.reorderStrategy.value()) {
+ case ReorderWorkgroupsStrategy::Transpose:
+ reorderStr = "transpose";
+ break;
+ case ReorderWorkgroupsStrategy::Swizzle:
+ reorderStr = "swizzle";
+ break;
+ case ReorderWorkgroupsStrategy::None:
+ reorderStr = "none";
+ break;
+ default:
+ assert(false && "Unhandled reorder option");
+ }
+ }
+
+ return os << "{" << "enableReduceSharedMemoryBankConflicts = "
+ << options.enableReduceSharedMemoryBankConflicts << ", "
+ << ", prefetchSharedMemory = " << options.prefetchSharedMemory
+ << ", reorderWorkgroupsStrategy = " << reorderStr
+ << ", enableUkernels = " << options.enableUkernels << "}";
+}
+
} // namespace mlir::iree_compiler::IREE::GPU
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
index f87fab9..f9f3d37 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -7,6 +7,7 @@
#ifndef IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
#define IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
@@ -26,6 +27,26 @@
mlir::FunctionOpInterface entryPoint,
Operation *op);
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Options
+//===----------------------------------------------------------------------===//
+
+using IREE::GPU::ReorderWorkgroupsStrategy;
+
+struct GPUPipelineOptions {
+ bool enableReduceSharedMemoryBankConflicts = true;
+ bool prefetchSharedMemory = false;
+ bool enableUkernels = false;
+ std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+ const GPUPipelineOptions &options);
+
+GPUPipelineOptions
+getPipelineOptions(FunctionOpInterface funcOp,
+ IREE::Codegen::TranslationInfoAttr translationInfo);
+
} // namespace mlir::iree_compiler::IREE::GPU
#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 118ce2b..ccf61a5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -13,6 +13,7 @@
#include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
#include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
#include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
@@ -380,9 +381,14 @@
// Prefetch shared memory if requested.
if (clLLVMGPUEnablePrefetch) {
+ auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
+ context, /*prefetchSharedMemory=*/true,
+ /*no_reduce_shared_memory_bank_conflicts=*/false,
+ /*reorder_workgroups_strategy=*/std::nullopt);
attrs.emplace_back(
- StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
- UnitAttr::get(context));
+ StringAttr::get(context,
+ IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
+ pipelineOptions);
}
auto configDict = DictionaryAttr::get(context, attrs);
@@ -610,9 +616,14 @@
// Prefetch shared memory if requested.
if (clLLVMGPUEnablePrefetch) {
+ auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
+ context, /*prefetchSharedMemory=*/true,
+ /*no_reduce_shared_memory_bank_conflicts=*/false,
+ /*reorder_workgroups_strategy=*/std::nullopt);
attrs.emplace_back(
- StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
- UnitAttr::get(context));
+ StringAttr::get(context,
+ IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
+ pipelineOptions);
}
auto configDict = DictionaryAttr::get(context, attrs);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index dc96c92..640f1c3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -8,6 +8,7 @@
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
#include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
@@ -76,50 +77,6 @@
void runOnOperation() override;
};
-
-static LLVMGPUPipelineOptions
-getPipelineOptions(FunctionOpInterface funcOp,
- IREE::Codegen::TranslationInfoAttr translationInfo) {
- LLVMGPUPipelineOptions pipelineOptions = {};
- auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
-
- LLVM_DEBUG(llvm::dbgs() << "Translation Info: " << translationInfo << "\n");
- LLVM_DEBUG(llvm::dbgs() << "Target Attr: " << targetAttr << "\n");
-
- if (DictionaryAttr config = translationInfo.getConfiguration()) {
- if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
- pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
- if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
- pipelineOptions.prefetchSharedMemory = true;
- if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
- // Get the workgroups reorder config and enable the workgroup reordering.
- Attribute reorderWorkgroupOption =
- config.get(LLVMGPUAttrNames::kReorderWorkgroups);
- if (!isa<StringAttr>(reorderWorkgroupOption))
- funcOp.emitOpError() << "'" << LLVMGPUAttrNames::kReorderWorkgroups
- << "' is expected to be a string attribute";
- StringRef reorderStr = llvm::cast<StringAttr>(reorderWorkgroupOption);
- if (reorderStr == "transpose") {
- pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Transpose;
- } else if (reorderStr == "swizzle") {
- pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
- } else {
- if (reorderStr != "none")
- funcOp.emitOpError()
- << "Unknown " << LLVMGPUAttrNames::kReorderWorkgroups
- << "value: " << reorderWorkgroupOption;
- else
- pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::None;
- }
- }
- }
-
- pipelineOptions.enableUkernels = targetAttr && hasUkernel(targetAttr);
-
- LLVM_DEBUG(llvm::dbgs() << "LLVMGPU Pipeline Options: " << pipelineOptions
- << "\n");
- return pipelineOptions;
-}
} // namespace
void LLVMGPULowerExecutableTargetPass::runOnOperation() {
@@ -138,8 +95,8 @@
}
OpPassManager &pipeline = maybePipeline.value();
- LLVMGPUPipelineOptions pipelineOptions =
- getPipelineOptions(funcOp, translationInfo);
+ IREE::GPU::GPUPipelineOptions pipelineOptions =
+ IREE::GPU::getPipelineOptions(funcOp, translationInfo);
switch (translationInfo.getDispatchLoweringPassPipeline()) {
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault:
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index c95c484..20471fd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -75,28 +75,6 @@
llvm::cl::desc("Enable implicit gemm for convolutions."),
llvm::cl::init(false));
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
- const LLVMGPUPipelineOptions &options) {
- StringRef reorderStr = "<not set>";
- if (options.reorderStrategy) {
- if (options.reorderStrategy == ReorderWorkgroupsStrategy::Transpose) {
- reorderStr = "transpose";
- } else if (options.reorderStrategy == ReorderWorkgroupsStrategy::Swizzle) {
- reorderStr = "swizzle";
- } else {
- assert(options.reorderStrategy == ReorderWorkgroupsStrategy::None &&
- "Unhandled reorder option");
- reorderStr = "none";
- }
- }
-
- return os << "{" << "enableReduceSharedMemoryBankConflicts = "
- << options.enableReduceSharedMemoryBankConflicts << ", "
- << ", prefetchSharedMemory = " << options.prefetchSharedMemory
- << ", reorderWorkgroupsStrategy = " << reorderStr
- << ", enableUkernels = " << options.enableUkernels << "}";
-}
-
//===----------------------------------------------------------------------===//
// Bufferization Configuration
//===----------------------------------------------------------------------===//
@@ -488,7 +466,7 @@
//===---------------------------------------------------------------------===//
void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options) {
+ const GPUPipelineOptions &options) {
tileAndDistributeToWorkgroup(funcPassManager);
funcPassManager.addPass(createCanonicalizerPass());
@@ -549,7 +527,7 @@
//===---------------------------------------------------------------------===//
void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options,
+ const GPUPipelineOptions &options,
unsigned pipelineDepth) {
tileAndBufferize(funcPassManager);
@@ -619,7 +597,7 @@
//===---------------------------------------------------------------------===//
void addGPUMatmulTensorCoreMmaSyncPassPipeline(
- OpPassManager &funcPassManager, const LLVMGPUPipelineOptions &options,
+ OpPassManager &funcPassManager, const GPUPipelineOptions &options,
unsigned pipelineDepth) {
tileAndBufferize(funcPassManager);
@@ -686,7 +664,7 @@
//===---------------------------------------------------------------------===//
void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options) {
+ const GPUPipelineOptions &options) {
tileAndDistributeToWorkgroup(funcPassManager);
funcPassManager.addPass(createCanonicalizerPass());
@@ -790,7 +768,7 @@
}
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options,
+ const GPUPipelineOptions &options,
bool usePadToModelSharedMemcpy) {
tileAndDistributeToWorkgroup(funcPassManager);
@@ -965,7 +943,7 @@
}
void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options) {
+ const GPUPipelineOptions &options) {
ConvertToDestinationPassingStylePassOptions dpsOptions;
dpsOptions.useWARForCooperativeMatrixCodegen = true;
tileAndDistributeToWorkgroup(funcPassManager, dpsOptions);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 2804a32..614d026 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -14,33 +14,12 @@
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
#include "mlir/Pass/Pass.h"
namespace mlir::iree_compiler {
-//===----------------------------------------------------------------------===//
-// Pass Pipeline Options
-//===----------------------------------------------------------------------===//
-
-/// Named attributes used in the `translation_info`'s config dictionary
-/// attribute. These are used to override default pass heuristics at the
-/// function granularity.
-namespace LLVMGPUAttrNames {
-inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
-inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
- "no_reduce_shared_memory_bank_conflicts";
-inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
-} // namespace LLVMGPUAttrNames
-
-struct LLVMGPUPipelineOptions {
- bool enableReduceSharedMemoryBankConflicts = true;
- bool prefetchSharedMemory = false;
- bool enableUkernels = false;
- std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
-};
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
- const LLVMGPUPipelineOptions &options);
+using IREE::GPU::GPUPipelineOptions;
//----------------------------------------------------------------------------//
// LLVMGPU backend Pass Pipelines.
@@ -48,16 +27,16 @@
/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options);
+ const GPUPipelineOptions &options);
/// Lowering using mma.sync Tensor Core operations.
void addGPUMatmulTensorCoreMmaSyncPassPipeline(
- OpPassManager &funcPassManager, const LLVMGPUPipelineOptions &options,
+ OpPassManager &funcPassManager, const GPUPipelineOptions &options,
unsigned pipelineDepth);
/// Lowering using wmma Tensor Core operations.
void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options,
+ const GPUPipelineOptions &options,
unsigned pipelineDepth);
void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
@@ -77,7 +56,7 @@
/// Lowering transpose using shared memory.
void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options);
+ const GPUPipelineOptions &options);
/// Lowering calling vectorization patterns. Expects pass manager to be a
/// module-level pass manager.
@@ -89,7 +68,7 @@
/// Lowering based on vector distribution patterns.
void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options,
+ const GPUPipelineOptions &options,
bool usePadToModelSharedMemcpy);
/// Lowering reductions to warp reductions.
@@ -97,7 +76,7 @@
/// Default pass pipeline on GPU, currently used only for the ukernel path.
void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
- const LLVMGPUPipelineOptions &options);
+ const GPUPipelineOptions &options);
/// Pass pipeline to lower IREE HAL executables without tiling and distribution.
void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
index 8cb1b2e..e5a7bf1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
@@ -2,20 +2,20 @@
// RUN: --iree-codegen-reorder-workgroups-strategy=transpose \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-OUT
-// Check that applying `reorder_workgroups` enables or disables workgroup reordering.
+// Check that applying `ReorderWorkgroups*` enables or disables workgroup reordering.
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-IN
-// Check that applying the `no_reduce_shared_memory_bank_conflicts` unit attribute disables shared memory padding.
+// Check that applying the `no_reduce_shared_memory_bank_conflicts` pipeline option attribute disables shared memory padding.
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME: no_reduce_shared_memory_bank_conflicts
// OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-IN-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-IN-SAME: no_reduce_shared_memory_bank_conflicts
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
@@ -48,7 +48,7 @@
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
- no_reduce_shared_memory_bank_conflicts // Disable the 'reduceSharedMemoryBankConflicts' pass.
+ gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true> // Disable the 'reduceSharedMemoryBankConflicts' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
@@ -81,15 +81,15 @@
// -----
-// Check that applying the `reorder_workgroups = transpose` unit attribute enables workgroup reordering.
+// Check that applying the `reorder_workgroups_strategy = Transpose` pipeline option attribute enables workgroup reordering.
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME: reorder_workgroups = "transpose"
// OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-IN-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-IN-SAME: reorder_workgroups = "transpose"
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
@@ -123,7 +123,7 @@
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
- reorder_workgroups = "transpose" // enable the 'reorderWorkgroups' pass.
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose> // enable the 'reorderWorkgroups' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
@@ -155,11 +155,11 @@
}
// -----
-// Check that applying the `reorder_workgroups = none` unit attribute disables workgroup reordering.
+// Check that applying the `reorder_workgroups_strategy = None` pipeline option disables workgroup reordering.
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME: reorder_workgroups = "none"
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
@@ -182,7 +182,7 @@
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
- reorder_workgroups = "none" // Disable the 'reorderWorkgroups' pass.
+ gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None> // Disable the 'reorderWorkgroups' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index 03a48cc..d232c9a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -345,7 +345,7 @@
// CHECK: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 32, 0, 64, 64]{{\]}}
// CHECK: #iree_codegen.translation_info<LLVMGPUVectorDistribute
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1
-// CHECK-NOT: prefetch_shared_memory
+// CHECK-NOT: prefetch_shared_memory = true
// CHECK-LABEL: func.func @attention_20x4096x64x4096x64()
@@ -380,7 +380,7 @@
// CHECK: #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 0, 32, 16]{{\]}}
// CHECK: #iree_codegen.translation_info<LLVMGPUVectorDistribute
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1
-// CHECK-NOT: prefetch_shared_memory
+// CHECK-NOT: prefetch_shared_memory = true
// CHECK-LABEL: func.func @attention_large_head_dim_shared_mem()
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index dfcf4d9..7aa6e83 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -46,9 +46,9 @@
// Basic pipeline test to make sure it generates the instructions we expect.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -93,9 +93,9 @@
}
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -160,7 +160,7 @@
}
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
-// CHECK-SAME: prefetch_shared_memory
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK: func @expanded_matmul_transpose_b
// CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -212,9 +212,9 @@
// Make sure it generates the mfma instructions we expect for f8 inputs.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_f8_f32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -261,9 +261,9 @@
// Make sure it generates the mfma instructions we expect for integer inputs.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_256x256x256_i8_i32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -310,9 +310,9 @@
// Make sure it generates the mfma instructions we expect for integer inputs.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @matmul_transpose_b_256x256x256_i8_i32()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -413,9 +413,9 @@
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
// This has more than 2 iterations. So we have prefetching enabled for this case. Due to
@@ -462,9 +462,9 @@
}
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
+// RDNA3-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// RDNA3-SAME: prefetch_shared_memory
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
@@ -512,9 +512,9 @@
}
// RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
+// RDNA3-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// RDNA3-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>,
// RDNA3-SAME: subgroup_m_count = 2, subgroup_n_count = 2>
-// RDNA3-SAME: prefetch_shared_memory
// RDNA3-LABEL: func.func @matmul_256x256x256_f16_f16
// RDNA3-SAME: translation_info = #[[$TRANSLATION]]
@@ -563,9 +563,9 @@
// Basic pipeline test to make sure it generates the instructions we expect.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -649,9 +649,9 @@
// Basic pipeline test to make sure it generates the instructions we expect.
// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4>
-// CHECK-SAME: prefetch_shared_memory
// CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
@@ -709,7 +709,7 @@
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 1>
// Prefetching is disabled for attention for now
-// CHECK-NOT: prefetch_shared_memory
+// CHECK-NOT: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
// CHECK-LABEL: func.func @attention_20x4096x64x4096x64()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]