[mlir][amdgpu] Replaced `nullopt` with target arch chipset in `populateGpuPromoteShuffleToAMDGPUPatterns` call (#21799)

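* Replaced `std::nullopt` with the parsed target-arch chipset in the call to
`populateGpuPromoteShuffleToAMDGPUPatterns` in the ConvertToROCDL pass.
* Hoisted the chipset parsing to the top of the pass so all users share one
value; an unparsable arch name now fails the pass up front, and the later
`value_or(amdgpu::Chipset())` fallback is removed.
* Added a lit test (`convert_to_rocdl_gfx950.mlir`) that checks the
`rocdl.permlane32.swap` / `rocdl.permlane16.swap` lowering on gfx950.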

---------

Signed-off-by: xintin <gaurav.verma@amd.com>
Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 2d49774..e110e82 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -202,6 +202,14 @@
     }
     bool use32BitIndices = clROCMIndexingBits == 32;
 
+    StringRef targetArch = getGPUTargetAttr(m).getArch();
+    FailureOr<amdgpu::Chipset> maybeChipset =
+        amdgpu::Chipset::parse(targetArch);
+    if (failed(maybeChipset)) {
+      m.emitOpError() << "Invalid chipset name: " << targetArch;
+      return signalPassFailure();
+    }
+
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(m.getContext(), DataLayout(m));
     options.overrideIndexBitwidth(use32BitIndices ? 32 : 64);
@@ -230,12 +238,6 @@
               vector::VectorContractLowering::OuterProduct);
       // These patterns only convert a subset of arith that target specific
       // rocdl intrinsics (e.g. fp8 conversions).
-      StringRef chipset = getGPUTargetAttr(m).getArch();
-      FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-      if (failed(maybeChipset)) {
-        m.emitOpError() << "Invalid chipset name: " << chipset;
-        return signalPassFailure();
-      }
       WalkResult allTypesValid = m.walk([&](Operation *op) {
         if (failed(validateDataTypes(op, *maybeChipset))) {
           return WalkResult::interrupt();
@@ -300,7 +302,7 @@
     {
       RewritePatternSet patterns(&getContext());
       populateGpuRewritePatterns(patterns);
-      populateGpuPromoteShuffleToAMDGPUPatterns(patterns, std::nullopt);
+      populateGpuPromoteShuffleToAMDGPUPatterns(patterns, maybeChipset);
       populateGpuSubgroupIdPatterns(patterns);
       if (failed(applyPatternsGreedily(m, std::move(patterns)))) {
         return signalPassFailure();
@@ -335,10 +337,8 @@
       populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
       cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
       arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
-      StringRef targetArch = getGPUTargetAttr(m).getArch();
-      amdgpu::Chipset chipset =
-          amdgpu::Chipset::parse(targetArch).value_or(amdgpu::Chipset());
-      populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns, chipset);
+      populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
+                                              *maybeChipset);
       vector::populateVectorRankReducingFMAPattern(llvmPatterns);
       vector::populateVectorInsertExtractStridedSliceTransforms(llvmPatterns);
       vector::populateVectorStepLoweringPatterns(llvmPatterns);
@@ -346,8 +346,8 @@
       populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
       vector::populateVectorTransferLoweringPatterns(llvmPatterns,
                                                      /*maxTransferRank=*/1);
-      populateGpuToROCDLConversionPatterns(converter, llvmPatterns,
-                                           gpu::amd::Runtime::Unknown, chipset);
+      populateGpuToROCDLConversionPatterns(
+          converter, llvmPatterns, gpu::amd::Runtime::Unknown, *maybeChipset);
       LLVMConversionTarget target(getContext());
       populateFuncToLLVMFuncOpConversionPattern(converter, llvmPatterns);
       configureGpuToROCDLConversionLegality(target);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 43106e2..5c1a1ba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -23,6 +23,7 @@
             "conv_pipeline_test_cuda.mlir",
             "convert_to_nvvm.mlir",
             "convert_to_rocdl.mlir",
+            "convert_to_rocdl_gfx950.mlir",
             "create_async_groups.mlir",
             "create_tile_sizes.mlir",
             "distribute_to_thread.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index edb6d23..87f644b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -29,6 +29,7 @@
     "conv_pipeline_test_cuda.mlir"
     "convert_to_nvvm.mlir"
     "convert_to_rocdl.mlir"
+    "convert_to_rocdl_gfx950.mlir"
     "create_async_groups.mlir"
     "create_tile_sizes.mlir"
     "distribute_to_thread.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir
new file mode 100644
index 0000000..b6aa502
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl_gfx950.mlir
@@ -0,0 +1,31 @@
+// RUN: iree-opt --iree-gpu-test-target=gfx950 --iree-convert-to-rocdl %s | FileCheck --check-prefix=CHECK-PERMLANE %s
+
+// Test permlane lowering on gfx950.
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>
+]>
+module {
+  func.func @test_permlane_16_32_lowering() {
+    %c0  = arith.constant 0 : index
+    %out = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<1xi32>
+
+    %tid = gpu.thread_id x
+    %val = arith.index_castui %tid : index to i32
+
+    // Emits rocdl.permlane*.swap on gfx950.
+    %p32 = amdgpu.permlane_swap %val 32 : i32
+    %a32 = arith.addi %val, %p32 : i32
+    %p16 = amdgpu.permlane_swap %a32 16 : i32
+    %sum = arith.addi %a32, %p16 : i32
+
+    %is0 = arith.cmpi eq, %tid, %c0 : index
+    scf.if %is0 {
+      memref.store %sum, %out[%c0] : memref<1xi32>
+    }
+    return
+  }
+}
+
+// CHECK-PERMLANE-LABEL: llvm.func @test_permlane_16_32_lowering
+// CHECK-PERMLANE: rocdl.permlane32.swap
+// CHECK-PERMLANE: rocdl.permlane16.swap