Add gpu::BarrierOp conversion for ROCDL (#15570)

Lower gpu::BarrierOp on ROCm: gpu.barrier is first rewritten to
amdgpu.lds_barrier, which is then converted from amdgpu to rocdl and from
rocdl to llvm.

Co-authored-by: Harsh Menon <harsh@nod-labs.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index f1746e4..60e518b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -93,6 +93,10 @@
         "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
         "//llvm-external-projects/iree-dialects:IREELinalgTransformDialectPasses",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:AMDGPUDialect",
+        "@llvm-project//mlir:AMDGPUToROCDL",
+        "@llvm-project//mlir:AMDGPUTransforms",
+        "@llvm-project//mlir:AMDGPUUtils",
         "@llvm-project//mlir:AffineDialect",
         "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:Analysis",
@@ -131,6 +135,8 @@
         "@llvm-project//mlir:PDLInterpDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLTarget",
+        "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:SCFToControlFlow",
         "@llvm-project//mlir:SCFTransforms",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 8aa547d..025ed5d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -72,6 +72,10 @@
     IREELinalgTransformDialect
     IREELinalgTransformDialectPasses
     LLVMSupport
+    MLIRAMDGPUDialect
+    MLIRAMDGPUToROCDL
+    MLIRAMDGPUTransforms
+    MLIRAMDGPUUtils
     MLIRAffineDialect
     MLIRAffineToStandard
     MLIRAnalysis
@@ -109,6 +113,8 @@
     MLIRPDLInterpDialect
     MLIRPass
     MLIRROCDLDialect
+    MLIRROCDLTarget
+    MLIRROCDLToLLVMIRTranslation
     MLIRSCFDialect
     MLIRSCFToControlFlow
     MLIRSCFTransforms
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index a3ab761..f7d33f8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -11,6 +11,7 @@
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
@@ -22,7 +23,9 @@
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
@@ -36,6 +39,49 @@
 
 namespace {
 
+/// Looks up "target_arch" on the enclosing executable variant; "" if unset.
+static StringRef getTargetArch(func::FuncOp entryPoint) {
+  auto variantOp =
+      entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+  if (!variantOp) {
+    return "";
+  }
+  auto config = variantOp.getTarget().getConfiguration();
+  auto archAttr =
+      config ? config.getAs<StringAttr>("target_arch") : StringAttr();
+  return archAttr ? archAttr.getValue() : StringRef("");
+}
+
+/// Target arch of the first entry-point function in `m`; "" if there is none.
+static StringRef getChipset(ModuleOp m) {
+  auto funcOps = m.getOps<func::FuncOp>();
+  auto firstEntry = llvm::find_if(
+      funcOps, [](func::FuncOp funcOp) { return isEntryPoint(funcOp); });
+  return firstEntry != funcOps.end() ? getTargetArch(*firstEntry)
+                                     : StringRef("");
+}
+
+/// Rewrites `gpu.barrier` into `amdgpu.lds_barrier`. IREE code generation
+/// currently only needs barriers to synchronize LDS operations, whereas
+/// `gpu.barrier` covers global memory operations as well.
+struct ReplaceGPUBarrierWithLDSBarrier final
+    : OpRewritePattern<gpu::BarrierOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(gpu::BarrierOp barrierOp,
+                                PatternRewriter &rewriter) const override {
+    OpBuilder::InsertionGuard restoreInsertionPoint(rewriter);
+    rewriter.replaceOpWithNewOp<amdgpu::LDSBarrierOp>(barrierOp);
+    return success();
+  }
+};
+
+/// Adds the `gpu.barrier` -> `amdgpu.lds_barrier` rewrite to `patterns`.
+void populateConvertGPUToAMDGPUPatterns(RewritePatternSet &patterns) {
+  MLIRContext *context = patterns.getContext();
+  patterns.add<ReplaceGPUBarrierWithLDSBarrier>(context);
+}
+
 /// A pass that replaces all occurrences of GPU device operations with their
 /// corresponding ROCDL equivalent.
 ///
@@ -43,7 +89,8 @@
 /// code.
 struct ConvertToROCDLPass : public ConvertToROCDLBase<ConvertToROCDLPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
+    registry.insert<LLVM::LLVMDialect, ROCDL::ROCDLDialect,
+                    amdgpu::AMDGPUDialect, gpu::GPUDialect>();
   }
   void runOnOperation() override {
     ModuleOp m = getOperation();
@@ -71,6 +118,7 @@
     // Run Vector -> Vector transformations ahead of conversion to LLVM.
     {
       RewritePatternSet patterns(&getContext());
+      populateConvertGPUToAMDGPUPatterns(patterns);
       populateDropSharedMemoryDeallocOpPatterns(patterns);
       populateScalarizeMathOps(patterns);
       populateConvertSharedMemoryAllocOps(patterns);
@@ -120,6 +168,16 @@
       populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
       cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
       arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
+      // `getChipset` returns "" when it finds no `target_arch`, and parsing
+      // yields a FailureOr that may hold no value. Dereferencing a failed
+      // FailureOr is undefined behavior, so fall back to a default-constructed
+      // chipset instead of reading it unconditionally.
+      FailureOr<amdgpu::Chipset> maybeChipset =
+          amdgpu::Chipset::parse(getChipset(m));
+      amdgpu::Chipset targetChipset =
+          succeeded(maybeChipset) ? *maybeChipset : amdgpu::Chipset();
+      populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
+                                              targetChipset);
       populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
       populateGpuToROCDLConversionPatterns(converter, llvmPatterns,
                                            gpu::amd::Runtime::Unknown);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
index 6273a8d..fd52a45 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
@@ -73,3 +73,28 @@
 }
 // CHECK-LABEL: llvm.func @reduction_maximum
 // CHECK:  llvm.intr.vector.reduce.fmax({{.*}})  : (vector<2xf32>) -> f32
+
+// -----
+// Test that gpu barriers are lowered to `s_waitcnt lgkmcnt(0)\0As_barrier` on rocm
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>,
+  #hal.descriptor_set.layout<1, bindings = [
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable @matmul_dispatch_0 {
+  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export @matmul_dispatch_0 layout(#pipeline_layout)
+    builtin.module {
+      func.func @matmul_dispatch_0() {
+        gpu.barrier
+        return
+      }
+    }
+  }
+}
+// CHECK-LABEL: llvm.func @matmul_dispatch_0
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier", ""  : () -> ()
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
index 5309016..8818485 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
@@ -43,11 +43,16 @@
         "@llvm-project//llvm:MC",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
+        "@llvm-project//mlir:AMDGPUDialect",
+        "@llvm-project//mlir:AMDGPUToROCDL",
+        "@llvm-project//mlir:AMDGPUTransforms",
+        "@llvm-project//mlir:AMDGPUUtils",
         "@llvm-project//mlir:BuiltinToLLVMIRTranslation",
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLTarget",
         "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:ToLLVMIRTranslation",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
index b0b2e6a..e64b04a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
@@ -31,11 +31,16 @@
     LLVMSupport
     LLVMTarget
     LLVMipo
+    MLIRAMDGPUDialect
+    MLIRAMDGPUToROCDL
+    MLIRAMDGPUTransforms
+    MLIRAMDGPUUtils
     MLIRBuiltinToLLVMIRTranslation
     MLIRLLVMDialect
     MLIRLLVMToLLVMIRTranslation
     MLIRPass
     MLIRROCDLDialect
+    MLIRROCDLTarget
     MLIRROCDLToLLVMIRTranslation
     MLIRSupport
     MLIRTargetLLVMIRExport
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
index db6a7fe..3aa6fe8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LogicalResult.h"
@@ -85,6 +86,7 @@
     mlir::registerLLVMDialectTranslation(registry);
     mlir::registerROCDLDialectTranslation(registry);
     registry.insert<IREE::Codegen::IREECodegenDialect>();
+    registry.insert<amdgpu::AMDGPUDialect>();
   }
 
   IREE::HAL::DeviceTargetAttr
@@ -320,6 +322,7 @@
         LLVMInitializeAMDGPUTarget();
         LLVMInitializeAMDGPUTargetMC();
         LLVMInitializeAMDGPUTargetInfo();
+        LLVMInitializeAMDGPUAsmParser();
         LLVMInitializeAMDGPUAsmPrinter();
         return std::make_shared<ROCMTargetBackend>();
       });