[mlir][amdgpu] Replaced `nullopt` with target arch chipset in `populateGpuPromoteShuffleToAMDGPUPatterns` pass (#21799)
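* Replaced `std::nullopt` with the parsed target-architecture chipset in the
`populateGpuPromoteShuffleToAMDGPUPatterns` call. The chipset is now parsed once
near the top of the pass (failing the pass on an invalid chipset name) and reused
by the later AMDGPU-to-ROCDL and GPU-to-ROCDL pattern population calls.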
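* Added a lit test (`convert_to_rocdl_gfx950.mlir`) that checks `amdgpu.permlane_swap` lowers to `rocdl.permlane32.swap` / `rocdl.permlane16.swap` on gfx950.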
---------
Signed-off-by: xintin <gaurav.verma@amd.com>
Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 2d49774..e110e82 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -202,6 +202,14 @@
}
bool use32BitIndices = clROCMIndexingBits == 32;
+ StringRef targetArch = getGPUTargetAttr(m).getArch();
+ FailureOr<amdgpu::Chipset> maybeChipset =
+ amdgpu::Chipset::parse(targetArch);
+ if (failed(maybeChipset)) {
+ m.emitOpError() << "Invalid chipset name: " << targetArch;
+ return signalPassFailure();
+ }
+
/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(m.getContext(), DataLayout(m));
options.overrideIndexBitwidth(use32BitIndices ? 32 : 64);
@@ -230,12 +238,6 @@
vector::VectorContractLowering::OuterProduct);
// These patterns only convert a subset of arith that target specific
// rocdl intrinsics (e.g. fp8 conversions).
- StringRef chipset = getGPUTargetAttr(m).getArch();
- FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
- if (failed(maybeChipset)) {
- m.emitOpError() << "Invalid chipset name: " << chipset;
- return signalPassFailure();
- }
WalkResult allTypesValid = m.walk([&](Operation *op) {
if (failed(validateDataTypes(op, *maybeChipset))) {
return WalkResult::interrupt();
@@ -300,7 +302,7 @@
{
RewritePatternSet patterns(&getContext());
populateGpuRewritePatterns(patterns);
- populateGpuPromoteShuffleToAMDGPUPatterns(patterns, std::nullopt);
+ populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
populateGpuSubgroupIdPatterns(patterns);
if (failed(applyPatternsGreedily(m, std::move(patterns)))) {
return signalPassFailure();
@@ -335,10 +337,8 @@
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
- StringRef targetArch = getGPUTargetAttr(m).getArch();
- amdgpu::Chipset chipset =
- amdgpu::Chipset::parse(targetArch).value_or(amdgpu::Chipset());
- populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns, chipset);
+ populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
+ *maybeChipset);
vector::populateVectorRankReducingFMAPattern(llvmPatterns);
vector::populateVectorInsertExtractStridedSliceTransforms(llvmPatterns);
vector::populateVectorStepLoweringPatterns(llvmPatterns);
@@ -346,8 +346,8 @@
populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
vector::populateVectorTransferLoweringPatterns(llvmPatterns,
/*maxTransferRank=*/1);
- populateGpuToROCDLConversionPatterns(converter, llvmPatterns,
- gpu::amd::Runtime::Unknown, chipset);
+ populateGpuToROCDLConversionPatterns(
+ converter, llvmPatterns, gpu::amd::Runtime::Unknown, *maybeChipset);
LLVMConversionTarget target(getContext());
populateFuncToLLVMFuncOpConversionPattern(converter, llvmPatterns);
configureGpuToROCDLConversionLegality(target);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 43106e2..5c1a1ba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -23,6 +23,7 @@
"conv_pipeline_test_cuda.mlir",
"convert_to_nvvm.mlir",
"convert_to_rocdl.mlir",
+ "convert_to_rocdl_gfx950.mlir",
"create_async_groups.mlir",
"create_tile_sizes.mlir",
"distribute_to_thread.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index edb6d23..87f644b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -29,6 +29,7 @@
"conv_pipeline_test_cuda.mlir"
"convert_to_nvvm.mlir"
"convert_to_rocdl.mlir"
+ "convert_to_rocdl_gfx950.mlir"
"create_async_groups.mlir"
"create_tile_sizes.mlir"
"distribute_to_thread.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir
new file mode 100644
index 0000000..b6aa502
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir
@@ -0,0 +1,31 @@
+// RUN: iree-opt --iree-gpu-test-target=gfx950 --iree-convert-to-rocdl %s | FileCheck --check-prefix=CHECK-PERMLANE %s
+
+// Test permlane lowering on gfx950.
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+ #hal.pipeline.binding<storage_buffer>
+]>
+module {
+ func.func @test_permlane_16_32_lowering() {
+ %c0 = arith.constant 0 : index
+ %out = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<1xi32>
+
+ %tid = gpu.thread_id x
+ %val = arith.index_castui %tid : index to i32
+
+ // Emits rocdl.permlane*.swap on gfx950.
+ %p32 = amdgpu.permlane_swap %val 32 : i32
+ %a32 = arith.addi %val, %p32 : i32
+ %p16 = amdgpu.permlane_swap %a32 16 : i32
+ %sum = arith.addi %a32, %p16 : i32
+
+ %is0 = arith.cmpi eq, %tid, %c0 : index
+ scf.if %is0 {
+ memref.store %sum, %out[%c0] : memref<1xi32>
+ }
+ return
+ }
+}
+
+// CHECK-PERMLANE-LABEL: llvm.func @test_permlane_16_32_lowering
+// CHECK-PERMLANE: rocdl.permlane32.swap
+// CHECK-PERMLANE: rocdl.permlane16.swap